This markdown describes the analysis of RNA-seq data to accompany the manuscript entitled "Muscle Transcriptional Networks Underlying Hypertrophic Response Heterogeneity".
The general workflow is as follows (Figure 1).
# Embed the workflow overview figure (Figure 1) into the rendered document
include_graphics("../2_pipeline/Figure_1.pdf")
For replication purposes:
# Record the R version used to run this analysis (reproducibility record)
R.Version()$version.string
## [1] "R version 4.0.2 (2020-06-22)"
All samples were aligned to genome in STAR and converted to featureCounts in R.
# Load the featureCounts matrix (provides `cts.updated`) and the gene
# annotation table (presumably provides `gene_df`, used below — confirm)
load("../../0_data/1_featureCounts/updated_MASTERS_cts.Rdata")
load("../../0_data/3_gene_data/gene_data.Rdata")
Subjects are retained if they have both CT (primary outcome) and DXA data. These data sets were stored and organized independently, so we need to do some work to bring them together and match them to the 31 participants for whom we have baseline RNA-seq.
# Read the master phenotype file and derive sequencing IDs from subject IDs.
# PID is the numeric part of the subject ID (3 digits for UABMAS, 2 for UKMET);
# SeqID_w0/SeqID_w16 follow the "P_<id>_<timepoint>" naming of cts.updated.
MASTERSpheno <- read.csv("../../0_data/2_phenotype/MASTERSphenofile.csv")
MASTERSpheno <- MASTERSpheno %>%
mutate(PID = str_extract(SubjectID, pattern = "(?<=MAS)[:digit:]{3}|(?<=MET)[:digit:]{2}")) %>%
mutate(SeqID_w0 = str_c("P",PID,"1",sep="_"),
SeqID_w16 = str_c("P",PID,"3",sep="_"),
# strip the leading zero for short (6-char) IDs so they match count columns
SeqID_w0 = ifelse(nchar(SeqID_w0)==6,gsub(pattern = "P_0", replacement = "P_", SeqID_w0),SeqID_w0),
SeqID_w16 = ifelse(nchar(SeqID_w16)==6,gsub(pattern = "P_0", replacement = "P_", SeqID_w16),SeqID_w16),
# two UKMET subjects use a different SeqID convention — hard-coded fixes
SeqID = ifelse(SubjectID=="UKMET05","05_1",SeqID),
SeqID = ifelse(SubjectID=="UKMET08","08_1",SeqID)) %>%
# keep subjects with baseline RNA-seq (with or without the week-16 sample)
filter((SeqID_w0 %in% names(cts.updated) & SeqID_w16 %in% names(cts.updated))|(SeqID_w0 %in% names(cts.updated)))
MASTERSpheno
# Long format: one row per sequenced sample rather than one row per subject
working.pheno.long <- MASTERSpheno %>%
dplyr::select(-SeqID) %>%
pivot_longer(cols = contains("SeqID"), names_to = NULL, values_to = "SeqID") %>%
filter(SeqID %in% names(cts.updated))
load(file = "../../0_data/2_phenotype/final_rnaseq_phenotype.RData")
anti_join(MASTERSpheno,final_rnaseq_phenotype, by="SubjectID") #check that all are there
# Merge DXA phenotypes and compute absolute/normalized/percent change in
# bilateral thigh lean mass between week 0 and week 16
DEXA <- left_join(MASTERSpheno,final_rnaseq_phenotype, by="SubjectID") %>%
mutate(delta_bilateral_thigh_normalized = bilateral_thigh_mass_normalized_w16 - bilateral_thigh_mass_normalized_w0,
delta_bilateral_thigh = bilateral_thigh_mass_w16 - bilateral_thigh_mass_w0,
percent_delta_bilateral_thigh =
(bilateral_thigh_mass_w16 - bilateral_thigh_mass_w0)/bilateral_thigh_mass_w0 * 100)
DEXA
#bring in CT and drop MET group
MASTERS_CT <- read.csv(file="../../0_data/2_phenotype/CT_data_for_UAB_09_18_2019.csv",
header=TRUE, stringsAsFactors = FALSE)
MASTERS_CT <- MASTERS_CT %>%
#filter(group_placement==" PLA") %>% #40
rename("SubjectID" = "study_id")
#create naming convention
# Build an old-name -> new-name lookup: week-16 columns get a "_w16" suffix
# (with the "wk16_" prefix removed), everything else gets "_w0"
df_id <- as.data.frame(names(MASTERS_CT)) %>%
rename("old"= "names(MASTERS_CT)") %>%
mutate("new" = ifelse(str_detect(old, "^wk16_"),str_c(old, "_w16",sep=""),str_c(old,"_w0",sep="")),
new = str_to_lower(gsub(pattern = "^wk16_", replacement = "", new)),
new = gsub("^\\.","", new))
CT.clean <- MASTERS_CT %>%
rename_with(~str_replace_all(., df_id$old, df_id$new)) %>%
rename("SubjectID"= "subjectid_w0") %>%
dplyr::select(-group_placement_w0)
CT.clean
# Combine DXA and CT; keep only subjects with CT data and baseline RNA-seq
MASTERSpheno <- full_join(DEXA, CT.clean, by="SubjectID") %>%
filter(!is.na(d_area_w0)) %>% #filter out those who have no CT data
filter((SeqID_w0 %in% names(cts.updated) & SeqID_w16 %in% names(cts.updated))|
(SeqID_w0 %in% names(cts.updated))) #drop subj with no seq data
# IHC (fiber typing) data: convert subject IDs to the SeqID convention and
# keep only sequenced samples
IHCtemp <- read.csv(file="../../0_data/2_phenotype/IHC_cleaned.csv")
ihc.seq <- IHCtemp %>%
mutate(SubjectID = gsub("UABMAS","P_",SubjectID),
SubjectID = gsub("UKMET0","P_",SubjectID), #get low numbers too
SubjectID = gsub("UKMET","P_",SubjectID)) %>%
unite(SeqID, c(SubjectID, Biopsy), sep="_", remove = FALSE, na.rm = TRUE) %>%
filter(SeqID %in% names(cts.updated))
# Summarize IHC fiber typing per sample: convert channel fractions
# (Cy5 = Type I, EGFP = Type IIa, Texas Red = Type IIx) to integer
# percentages and rename cross-sectional-area columns to fiber-type labels.
# NOTE(review): a redundant first assignment of ihc.summary (a plain
# Percent-column subset that was immediately overwritten) was removed.
ihc.summary <- ihc.seq %>%
mutate(TypeI_percent = round(X_Cy5_Percent*100,0),
TypeIIa_percent = round(X_EGFP_Percent*100,0),
TypeIIx_percent = round(X_Texas.Red_Percent*100,0),
Hybrid_percent = round(Hybrids_Percent*100,0),
TypeIItot_percent = TypeIIa_percent + TypeIIx_percent + Hybrid_percent) %>%
dplyr::select(SeqID, contains("Type"), Hybrid_percent,contains("csa"),-contains("%")) %>%
rename("TypeI_CSA" = "X_Cy5_csa",
"TypeIIa_CSA" = "X_EGFP_csa",
"TypeIIx_CSA" = "X_Texas.Red_csa",
"Negatives_CSA" = "Negatives_csa",
"Hybrids_CSA" = "Hybrids_csa")
# QC: mean and SD of the number of fibers counted per sample
mean(ihc.seq$Fiber_Count) #890.4151
## [1] 890.4151
sd(ihc.seq$Fiber_Count) #496.6494
## [1] 496.6494
We need to convert the phenotype file to long format in order to create the design matrix that will be used to do the batch adjustment.
# Build a long-format phenotype table (one row per sequenced sample).
# pheno1 = baseline (week 0) samples, pheno3 = week-16 samples; the
# timepoint suffix is stripped so the two can be row-bound on shared names.
pheno1 <- MASTERSpheno %>%
filter(SeqID_w0 %in% names(cts.updated)) %>% #31 basal
dplyr::select(SubjectID, Site, Age, Sex, Treatment, contains("w0")) %>%
rename("avg_hu_w0"="avg_hu_baseline_w0")
names(pheno1)[str_detect(names(pheno1),"_w0")] <- gsub("_w0","",names(pheno1[str_detect(names(pheno1),"w0")]))
pheno3 <- MASTERSpheno %>%
filter(SeqID_w16 %in% names(cts.updated)) %>% #22 both
dplyr::select(SubjectID, Site, Age, Sex, Treatment, contains("w16")) %>%
rename("skm_area_r_w16" ="avg_area_r_w16")
names(pheno3)[str_detect(names(pheno3),"_w16")] <- gsub("_w16","",names(pheno3[str_detect(names(pheno3),"w16")]))
pheno3 <- pheno3 %>%
dplyr::select(any_of(names(pheno1))) #no deltas that's okay
# Stack both timepoints; Site becomes Batch, with the two late-added UKMET
# subjects flagged as a third batch. Timepoint is the last SeqID character
# ("1" = week 0, "3" = week 16).
pheno.long <- bind_rows(pheno1, pheno3) %>%
rename("Batch" = "Site") %>%
mutate(Batch = ifelse(SubjectID ==("UKMET05"),3,Batch), #mark that new ones are in different batch
Batch = ifelse(SubjectID ==("UKMET08"),3,Batch)) %>%
mutate("Timepoint" = str_sub(SeqID, -1)) %>%
relocate(c(SeqID, Timepoint), .after = SubjectID)
rownames(pheno.long) <- pheno.long$SeqID
dim(ihc.summary)
## [1] 53 13
dim(pheno.long)
## [1] 53 37
# Attach IHC summaries to the sample-level phenotype table
pheno.long <- full_join(pheno.long, ihc.summary, by="SeqID")
pheno.long
Participant Characteristics appear in Table 1.
# Subset the variables reported in Table 1; batch 3 (late UKMET additions)
# is collapsed into batch 2 (same site) for reporting purposes
x <- pheno.long %>%
dplyr::select(Timepoint,Sex,Batch,body_fat_percent,bilateral_thigh_mass_normalized,
avg_area,Avg_CSA,TypeIIa_CSA,avg_norm,avg_low,avg_hu) %>%
mutate(Sex = as.factor(Sex), Timepoint = as.factor(Timepoint)) %>%
#we don't have BMI or height and weight in here, so that is not shown
mutate(Batch = as.factor(ifelse(Batch==3,2,Batch)))
# Identify the numeric columns to summarize. is.numeric() returns a logical,
# so map_lgl() is the correct mapper (map_chr() over logical results errors
# in purrr >= 1.0; the old code relied on implicit "TRUE" string coercion).
numeric_cols <- map_lgl(x, is.numeric)
to_mutate <- names(numeric_cols)[numeric_cols]
all(to_mutate %in% names(x))
## [1] TRUE
# Sex counts per timepoint (coded 1 = female, 2 = male — confirm coding)
a <- x %>%
group_by(Timepoint) %>%
summarise(n_female = length(Sex[Sex =="1"]),
n_male = length(Sex[Sex =="2"]))
# Site counts per timepoint (1 = UAB, 2 = UK after batch collapsing)
b <- x %>% #apparently this has to be done separately because Batch gets lost when it groups by time
group_by(Timepoint) %>%
summarise(n_UAB = length(Batch[Batch =="1"]),
n_UK = length(Batch[Batch =="2"]))
# Mean and SD of each numeric phenotype per timepoint. all_of() marks the
# external vector `to_mutate` explicitly, silencing the tidyselect
# ambiguity message the original code emitted.
c <- x %>%
group_by(Timepoint) %>%
summarise(across(.cols = all_of(to_mutate),
.fns = c(mean, sd)))
# across() with unnamed functions produces "_1"/"_2" suffixes; rename them
# to the more readable "_mean"/"_sd"
df_id <- as.data.frame(names(c)) %>%
rename(old = "names(c)") %>%
mutate(new = gsub(pattern = "_1","_mean",old)) %>%
mutate(new = gsub(pattern = "_2","_sd",new))
c <- c %>%
rename_with(~str_replace_all(., df_id$old, df_id$new))
# Combine counts and summary stats; unique() drops the duplicated
# Timepoint columns introduced by cbind()
table_1 <- cbind(a,b,c) %>%
dplyr::select(unique(c(names(a),names(b),names(c))))
table_1
We also report RIN scores, which were collected and stored in a separate data file.
# RNA integrity (RIN) scores live in a separate spreadsheet; study IDs
# start with "U" (UABMAS/UKMET), which filters out non-sample rows
RINscores <- readxl::read_xlsx("../../0_data/2_phenotype/RIN summary.xlsx")
RINscores <- RINscores %>%
filter(str_detect(ID, "^U"))
all(RINscores$ID %in% pheno.long$SubjectID) #these are the same as the samples we have
## [1] TRUE
# The coercion warnings below indicate non-numeric RIN entries in the
# spreadsheet; they become NA and are dropped by na.rm = TRUE
RINscores %>%
filter(str_detect(ID, "^U")) %>%
summarise(RIN_mean = round(mean(as.numeric(RIN), na.rm = TRUE),1),
RIN_sd = round(sd(as.numeric(RIN), na.rm = TRUE),1))
## Warning in mean(as.numeric(RIN), na.rm = TRUE): NAs introduced by coercion
## Warning in sd(as.numeric(RIN), na.rm = TRUE): NAs introduced by coercion
#pheno.wide1 <- adj.pheno.long %>% filter(str_detect(SeqID,"_1$"))
#names(pheno.wide1)[2:40] <- str_c(names(pheno.wide1[2:40]),"_w0")
#pheno.wide3 <- adj.pheno.long %>% filter(str_detect(SeqID,"_3$"))
#names(pheno.wide3)[2:40] <- str_c(names(pheno.wide3[2:40]),"_w16")
# Wide format: one row per subject, keyed on the baseline SeqID
pheno.wide <- MASTERSpheno
rownames(pheno.wide) <- pheno.wide$SeqID_w0
# Reshape IHC measures to wide (one column per measure x timepoint);
# SeqID suffix "1" = week 0, everything else = week 16
ihc <- pheno.long %>%
dplyr::select("SubjectID","SeqID", contains("TypeI"), "Hybrid_percent", contains("CSA")) %>%
mutate(TimePoint = gsub(".*_","",SeqID),
TimePoint = ifelse(TimePoint=="1","w0","w16")) %>%
pivot_wider(id_cols = SubjectID, names_from = TimePoint,
values_from = c(contains("TypeI"), "Hybrid_percent", contains("CSA")))
x <- full_join(MASTERSpheno, ihc, by = "SubjectID") %>%
rename("Batch"="Site")
#filter pheno for pre only
# Compute subject-level change scores for CT area and mean myofiber CSA
pheno.wide <- x %>%
mutate(Sex = as.numeric(as.character(Sex)),
Batch = as.numeric(as.character(Batch)),
d_CT_area = avg_area_w16 - avg_area_w0,
d_myofiber_CSA = Avg_CSA_w16 - Avg_CSA_w0)
# Fiber-type-specific CSA change scores
pheno.wide.fiber <- pheno.wide %>%
mutate(d_typeI_CSA = TypeI_CSA_w16-TypeI_CSA_w0,
d_typeIIa_CSA = TypeIIa_CSA_w16-TypeIIa_CSA_w0,
d_typeIIx_CSA = TypeIIx_CSA_w16-TypeIIx_CSA_w0,
d_hybrid_CSA = Hybrids_CSA_w16-Hybrids_CSA_w0,
d_negative_CSA = Negatives_CSA_w16-Negatives_CSA_w0)
As a general rule of thumb, figures and tables are numbered based on the order of their presentation in the manuscript, despite where in the workflow they are generated. We have numbered things clearly in order to make it easier to trace where they came from in the code.
We are interested in showing the range of heterogeneity as measured by CT. Here we simply show this using waterfall plot by ordering the subjects based on the degree of hypertrophy as measured by CT.
# Waterfall plot: participants ranked by CT-measured change in thigh
# muscle area. row_number() replaces the hard-coded c(1:31) so the plot
# stays correct if the sample count changes.
pheno.wide.fiber %>%
filter(!is.na(d_area_w0)) %>%
arrange(d_area_w0) %>%
mutate(position_rank = row_number()) %>%
ggplot()+geom_col(mapping = aes(x = position_rank, y = d_area_w0)) +
theme_linedraw() +
labs(x="Participant Rank", y = "Change in Thigh Muscle Area (CT)")
The same is performed for the other outcomes and presented in Figures S3A and B.
#Figure S3A
# Waterfall plot of change in normalized bilateral thigh lean mass (DXA).
# row_number() replaces the hard-coded c(1:31).
pheno.wide.fiber %>%
filter(!is.na(delta_bilateral_thigh_normalized)) %>%
arrange(delta_bilateral_thigh_normalized) %>%
mutate(position_rank = row_number()) %>%
ggplot +
geom_col(mapping = aes(x = position_rank, y = delta_bilateral_thigh_normalized)) +
theme_linedraw() +
labs(x="Participant Rank", y = "Change in Normalized Bilateral Thigh Lean Mass (DXA)")
#Figure S3B
# Waterfall plot of change in mean myofiber CSA (IHC); row_number()
# replaces the hard-coded c(1:22). Also fixes the unbalanced ")" in the
# y-axis label.
pheno.wide.fiber %>%
filter(!is.na(d_myofiber_CSA)) %>%
arrange(d_myofiber_CSA) %>%
mutate(position_rank = row_number()) %>%
ggplot + geom_col(mapping = aes(x = position_rank, y = d_myofiber_CSA)) +
theme_linedraw() +
labs(x="Participant Rank", y = "Change in Myofiber Cross-Sectional Area (µm2)")
The adjustment using residuals varies based on what else is in the data set. In order not to have irrelevant samples influence the residuals for a given subset of samples, we perform the adjustment with all three sets of data: all, basal only, and paired only.
#calculate the residuals with all 53 samples
# For every phenotype column (from column 7 onward), regress the phenotype
# on Sex and keep the residuals as the sex-adjusted value ("adj_" prefix).
# NA rows are dropped per variable, then joined back by SeqID so samples
# missing a phenotype get NA in the adjusted column.
adj.pheno.long <- pheno.long %>%
dplyr::select(SubjectID, SeqID, Batch, Age, Sex, everything(), -Treatment)
columns <- c(7:ncol(adj.pheno.long)) ##MANUAL CHANGE
for (i in (1:length(columns))){
j <- columns[i]
variable <- names(adj.pheno.long[j])
pDat.clean_noNA = adj.pheno.long[which(!is.na(adj.pheno.long[,variable])),]
# gaussian glm is equivalent to lm here; residuals are the adjusted values
adjvar = glm((pDat.clean_noNA[,variable]) ~ pDat.clean_noNA$Sex, family = "gaussian")
summary(adjvar)
pDat.clean_noNA[,variable] = adjvar$residuals
x <- paste("adj",variable, sep = "_")
names(pDat.clean_noNA)[names(pDat.clean_noNA)==variable] <- x
adj.pheno.long <- left_join(adj.pheno.long, pDat.clean_noNA[,c("SeqID",x)], by = "SeqID")
}
# Keep only identifiers plus the adjusted columns
adj.pheno.long <- adj.pheno.long %>%
dplyr::select(SubjectID, SeqID, Batch, Age, contains("adj_"))
rownames(adj.pheno.long) <- adj.pheno.long$SeqID
adj.pheno.long <- adj.pheno.long %>%
mutate(Batch = as.factor(Batch))
We can illustrate this graphically. This is presented in Figure S1:
#load colors to make it look similar to the ggplot output
# First two colors of the default ggplot2 hue palette (female/male labels)
hex_codes1 <- hue_pal()(2)
hex_codes1
## [1] "#F8766D" "#00BFC4"
#subset to numeric
# is.numeric() returns a logical, so map_lgl() is the correct mapper
# (map_chr() over logical results errors in purrr >= 1.0). all_of() marks
# the external vector explicitly, silencing the tidyselect note.
y <- names(pheno.long)[map_lgl(pheno.long, is.numeric)]
find.outliers <- pheno.long %>%
column_to_rownames("SeqID") %>%
dplyr::select(all_of(y))
transposed <- t(find.outliers)
exp.outlier <- find.outliers
# sample network based on squared Euclidean distance
A.ecl <- adjacency(t(exp.outlier),type="distance")
# this calculates the whole network connectivity
k.ecl <- as.numeric(apply(A.ecl,2,sum))-1 #add columns of A, subtracts relationship to itself
# standardized connectivity
Z.k.ecl <- scale(k.ecl) #scale values
# designate samples as outlying if their Z.k.ecl value is below the threshold
thresholdZ.k.ecl <- -1.5
# the color vector indicates outlyingness (red)
outlierColor.ecl <- ifelse(Z.k.ecl<thresholdZ.k.ecl,"red","black")
# calculate the cluster tree using flashClust or hclust
sampleTree.ecl <- hclust(as.dist(1-A.ecl), method = "average")
hcd_pheno <- as.dendrogram(sampleTree.ecl)
# Color dendrogram labels by sex (matching the ggplot hue palette)
key <- as.data.frame(get_nodes_attr(hcd_pheno, "label", na.rm=TRUE))
names(key) = "ID"
key <- key %>%
mutate(sex = pheno.long$Sex[match(ID, pheno.long$SeqID)]) %>%
mutate(colorkey = ifelse(sex == 2, hex_codes1[2],(ifelse(sex==1, hex_codes1[1],NA))))
labels_colors(hcd_pheno) <- key$colorkey
# Sanity check: label colors agree with the sex coding in pheno.long
x <- labels_colors(hcd_pheno)
all(names(x)[x==hex_codes1[1]] %in% pheno.long$SeqID[pheno.long$Sex=="1"])
## [1] TRUE
all(names(x)[x==hex_codes1[2]] %in% pheno.long$SeqID[pheno.long$Sex=="2"])
## [1] TRUE
plot(hcd_pheno, type = "rectangle", ylab = "Height")
An obvious split is seen, but after the adjustment:
for_tree <- adj.pheno.long
#subset to numeric
# map_lgl() is the correct mapper for a logical predicate (map_chr() over
# logical results errors in purrr >= 1.0); all_of() silences the
# tidyselect external-vector note.
z <- names(adj.pheno.long)[map_lgl(for_tree, is.numeric)]
find.outliers <- for_tree %>%
dplyr::select(all_of(z))
transposed <- t(find.outliers)
exp.outlier <- find.outliers
# sample network based on squared Euclidean distance
A.ecl <- adjacency(t(exp.outlier),type="distance")
# this calculates the whole network connectivity
k.ecl <- as.numeric(apply(A.ecl,2,sum))-1 #add columns of A, subtracts relationship to itself
# standardized connectivity
Z.k.ecl <- scale(k.ecl) #scale values
# designate samples as outlying if their Z.k.ecl value is below the threshold
thresholdZ.k.ecl <- -1.5
# the color vector indicates outlyingness (red)
outlierColor.ecl <- ifelse(Z.k.ecl<thresholdZ.k.ecl,"red","black")
# calculate the cluster tree using flashClust or hclust
adj.sampleTree.ecl <- hclust(as.dist(1-A.ecl), method = "average")
hcd_pheno_adj <- as.dendrogram(adj.sampleTree.ecl)
# Color dendrogram labels by sex, as for the unadjusted tree above
key <- as.data.frame(get_nodes_attr(hcd_pheno_adj, "label", na.rm=TRUE))
names(key) = "ID"
key <- key %>%
mutate(sex = pheno.long$Sex[match(ID, pheno.long$SeqID)]) %>%
mutate(colorkey = ifelse(sex == 2, hex_codes1[2],(ifelse(sex==1, hex_codes1[1],NA))))
labels_colors(hcd_pheno_adj) <- key$colorkey
# Sanity check: label colors agree with the sex coding in pheno.long
x <- labels_colors(hcd_pheno_adj)
all(names(x)[x==hex_codes1[1]] %in% pheno.long$SeqID[pheno.long$Sex=="1"])
## [1] TRUE
all(names(x)[x==hex_codes1[2]] %in% pheno.long$SeqID[pheno.long$Sex=="2"])
## [1] TRUE
plot(hcd_pheno_adj, type = "rectangle", ylab = "Height")
# CT area distribution by sex, before sex adjustment
pheno.long %>%
ggplot +
geom_histogram(mapping = aes(x=avg_area, fill = factor(Sex)), binwidth = 20) +
ggtitle("CT")
# Same distribution after the Sex-residual adjustment: the sexes overlap
adj.pheno.long %>%
mutate(Sex = pheno.long$Sex[match(SeqID, pheno.long$SeqID)]) %>%
ggplot +
geom_histogram(mapping = aes(x=adj_avg_area, fill = factor(Sex)), binwidth = 20) +
ggtitle("CT")
# Build a summary table of selected outcomes: columns are assumed to come
# in w0/w16 pairs, so the loop runs a paired t-test on columns (i, i+1)
# for odd i only. The last column's mean/SD is filled in manually after
# the loop (there is no i+1 partner for the final index).
y <- pheno.wide.fiber %>%
dplyr::select(SeqID, contains("body_fat_percent"), contains("bilateral_thigh_mass_normalized"),
contains("avg_area_w"), contains("Avg_CSA"), starts_with("TypeIIa_CSA"),
contains("avg_norm"), contains("avg_low"), contains("avg_hu")) %>%
column_to_rownames(var = "SeqID")
subjecttable <- as.data.frame(names(y)) %>%
rename(parameter = "names(y)") %>%
mutate(mean = NA, sd = NA, ttest = NA)
for (i in 1:(ncol(y)-1)){
subjecttable$mean[i] = round(mean(y[,i], na.rm=TRUE),1)
subjecttable$sd[i] = round(sd(y[,i], na.rm=TRUE),1)
res <- t.test( y[,i],y[,i+1],paired = TRUE)
# only record p-values for odd i (the w0 member of each w0/w16 pair)
subjecttable$ttest[i] = ifelse(i%%2!=0, res$p.value,
subjecttable$ttest[i])
} #fails at end because there is no way to add a column at end but it's okay, we get what we need
# NOTE(review): index 16 is hard-coded as the final column — confirm this
# still matches ncol(y) if the select() above changes
subjecttable$mean[16] = round(mean(y[,16], na.rm=TRUE),1)
subjecttable$sd[16] = round(sd(y[,16], na.rm=TRUE),1)
subjecttable %>%
mutate(ttest = round(ttest,2))
As described above, residuals will also be calculated on the baseline only and paired only samples, due to the undue influence of unrelated numbers in the data set on the outcomes. The same is done for the gene expression data in the ComBat adjustment sections below.
# Sex-residual adjustment on the baseline-only (wide) phenotype table,
# mirroring the all-sample loop above but joining back by SubjectID
resid.pheno.wide <- pheno.wide.fiber %>%
dplyr::select(SubjectID, SeqID, Batch, Age, Sex, everything(), -Treatment,-starts_with("X.."))
columns <- c(9:ncol(resid.pheno.wide)) ##MANUAL CHANGE
#columns <- 82
for (i in (1:length(columns))){
j <- columns[i]
variable <- names(resid.pheno.wide[j])
pDat.clean_noNA = resid.pheno.wide[which(!is.na(resid.pheno.wide[,variable])),]
# gaussian glm is equivalent to lm; residuals become the adjusted values
adjvar = glm((pDat.clean_noNA[,variable]) ~ pDat.clean_noNA$Sex, family = "gaussian")
summary(adjvar)
pDat.clean_noNA[,variable] = adjvar$residuals
x <- paste("adj",variable, sep = "_")
names(pDat.clean_noNA)[names(pDat.clean_noNA)==variable] <- x
resid.pheno.wide <- merge(resid.pheno.wide, pDat.clean_noNA[,c("SubjectID",x)], by = "SubjectID", all.x = TRUE)
}
resid.pheno.wide <- resid.pheno.wide %>%
dplyr::select(SubjectID, SeqID_w0, Batch, Age, contains("adj_")) %>%
mutate(Batch = as.numeric(as.character(Batch)))
# Identify subjects with both timepoints: strip the "_<timepoint>" suffix
# and keep IDs that occur exactly twice
pairedkey <- summary(as.factor(str_sub(pheno.long$SeqID,0,-3)))
keep <- names(pairedkey)[pairedkey=="2"]
pheno.long.paired <- pheno.long %>%
filter(str_sub(SeqID,0,-3) %in% keep)
# Sex-residual adjustment on the paired samples only
resid.pheno.paired <- pheno.long.paired %>%
dplyr::select("SubjectID","SeqID","Timepoint","Batch","Age","Sex", everything())
# NOTE(review): the index range is taken from pheno.long.paired but indexes
# resid.pheno.paired — same columns reordered, so ncol matches; confirm if
# either select() changes
columns <- c(8:ncol(pheno.long.paired)) ##MANUAL CHANGE
for (i in (1:length(columns))){
j <- columns[i]
variable <- names(resid.pheno.paired[j])
pDat.clean_noNA = resid.pheno.paired[which(!is.na(resid.pheno.paired[,variable])),]
adjvar = glm((pDat.clean_noNA[,variable]) ~ pDat.clean_noNA$Sex, family = "gaussian")
summary(adjvar)
pDat.clean_noNA[,variable] = adjvar$residuals
x <- paste("adj",variable, sep = "_")
names(pDat.clean_noNA)[names(pDat.clean_noNA)==variable] <- x
resid.pheno.paired <- merge(resid.pheno.paired, pDat.clean_noNA[,c("SeqID",x)], by = "SeqID", all.x = TRUE)
}
resid.pheno.paired <- resid.pheno.paired %>%
dplyr::select("SubjectID","SeqID","Timepoint","Batch","Age","Sex","Treatment",contains("adj_")) %>%
column_to_rownames(var = "SeqID")
# Order the count columns to match the phenotype rows, then verify alignment
cts <- cts.updated %>%
dplyr::select(pheno.long$SeqID)
cts.mat <- as.matrix(cts)
rownames(pheno.long) <- pheno.long$SeqID
all(names(cts.mat) == rownames(pheno.long))
## [1] TRUE
dim(cts.mat)
## [1] 58721 53
For PLIER purposes, we will perform the batch adjustment on all samples together.
# ComBat-seq batch adjustment across all 53 samples, preserving Sex as a
# covariate. The adjustment itself is commented out (slow); the cached
# result is loaded instead.
batch <- pheno.long$Batch
covar.mod <- model.matrix(~as.factor(Sex), data = pheno.long) #establishes reference
#combat_adj <- ComBat_seq(cts.mat, batch = batch, group = NULL, covar_mod = covar.mod)
load(file = "../../2_pipeline/1_MASTERS_WGCNA/out/ComBat_test.Rdata")
dim(combat_adj)
## [1] 58721 53
Below we clean and filter gene data in order to plot the sample dendrogram, Figure 2.
# Convert the ComBat-adjusted matrix to a data frame for gene filtering
cts.adj <- as.data.frame(combat_adj)
# Total counts per gene. rowSums() is vectorized over rows, so the original
# per-row loop (rowSums(cts.adj[i,]) for each i) is unnecessary and O(n^2)
# in practice; this computes the identical vector in one call.
genesums <- rowSums(cts.adj)
# Keep only genes with at least one read across all samples
above0 <- genesums > 0
sum(above0)
## [1] 40258
filtered <- cts.adj[above0,]
# Fold difference between the largest and smallest library after filtering
max(colSums(filtered))/min(colSums(filtered))
## [1] 4.090651
filtered <- as.matrix(filtered)
# Variance-stabilizing transformation (DESeq2); blind = TRUE ignores design
blindvst <- varianceStabilizingTransformation(round(filtered), blind=TRUE, fitType = "parametric")
## converting counts to integer mode
# After VST the library-size spread shrinks from ~4.1x to ~1.01x
max(colSums(blindvst))/min(colSums(blindvst))
## [1] 1.009385
dim(blindvst)
## [1] 40258 53
# no-op since R 4.0, kept for compatibility with older WGCNA tutorials
options(stringsAsFactors = FALSE)
dim(blindvst)
## [1] 40258 53
# WGCNA expects samples in rows and genes in columns, so transpose
vstExpr0 = as.data.frame(t(blindvst))
colnames(vstExpr0) = rownames(blindvst)
rownames(vstExpr0) = colnames(blindvst)
dim(vstExpr0)
## [1] 53 40258
# Flag genes/samples with too many missing values (WGCNA QC)
gsg = goodSamplesGenes(vstExpr0, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
if (!gsg$allOK)
{
full_expr = vstExpr0[gsg$goodSamples, gsg$goodGenes]
}
if (gsg$allOK)
{
full_expr = vstExpr0
}
dim(full_expr)
## [1] 53 40258
all.equal(rownames(full_expr),rownames(vstExpr0)) #check they all stayed
## [1] TRUE
# Sample dendrogram on the full expression set (Figure S2)
sampleTree = hclust(dist(full_expr), method = "average")
plot(sampleTree, main = "Figure S2", sub="", xlab="", cex.lab = 1,
cex.axis = 1, cex.main = 1)
#dev.copy2pdf(file = "../../3_output/1_MASTERS_WGCNA/Dendrograms/AllSamples.pdf", width = 12, height = 9);
For WGCNA Prediction and WGCNA Plasticity, we will perform batch adjustment on the appropriate data separately.
# Baseline-only ComBat adjustment: subset counts to week-0 samples
# (SeqIDs ending in "_1"), ordered to match the wide phenotype table
cts_basal <- cts.updated[str_detect(names(cts.updated),"_1$")] #end in 1
cts_basal <- cts_basal %>%
dplyr::select(pheno.wide$SeqID_w0) #put in right order
cts_basal.mat <- as.matrix(cts_basal)
rownames(pheno.wide) <- pheno.wide$SeqID_w0
all(names(cts_basal.mat) == rownames(pheno.wide))
## [1] TRUE
dim(cts_basal.mat)
## [1] 58721 31
# Hard-coded SeqID fix for the two late-added UKMET subjects
pheno.wide <- pheno.wide %>%
mutate(SeqID = ifelse(SubjectID == "UKMET05","5_1",SeqID),
SeqID = ifelse(SubjectID == "UKMET08","8_1",SeqID))
batch.basal <- pheno.wide$Batch
covar.mod.basal <- model.matrix(~as.factor(Sex), data = pheno.wide) #establishes reference
# Adjustment commented out (slow); basal_combat_adj is assumed to be
# available from a cached run — TODO confirm where it is loaded
#basal_combat_adj <- ComBat_seq(cts_basal.mat, batch = batch.basal, group = NULL, covar_mod = covar.mod.basal)
dim(basal_combat_adj)
## [1] 58721 31
basal_cts.adj <- as.data.frame(basal_combat_adj)
# Total counts per gene. rowSums() is vectorized over rows, so the original
# per-row loop is unnecessary; this produces the identical vector.
genesums <- rowSums(basal_cts.adj)
# Keep genes with nonzero total counts
above0 <- genesums > 0
sum(above0)
## [1] 37891
filt_basal <- basal_cts.adj[above0,]
# Library-size spread after filtering
max(colSums(filt_basal))/min(colSums(filt_basal))
## [1] 4.087136
This reduces the number of genes from 58721 to 37891, but there is still a 4.1-fold difference in library size.
filt_basal <- as.matrix(filt_basal)
# Variance-stabilizing transformation of the baseline-only counts
basal_set <- varianceStabilizingTransformation(round(filt_basal),
blind=TRUE, fitType = "parametric")
## converting counts to integer mode
max(colSums(basal_set))/min(colSums(basal_set))
## [1] 1.008464
dim(basal_set)
## [1] 37891 31
# no-op since R 4.0, kept for compatibility with older WGCNA tutorials
options(stringsAsFactors = FALSE)
dim(basal_set)
## [1] 37891 31
# Transpose to WGCNA orientation (samples in rows, genes in columns)
basal_expr = as.data.frame(t(basal_set))
colnames(basal_expr) = rownames(basal_set)
rownames(basal_expr) = colnames(basal_set)
dim(basal_expr)
## [1] 31 37891
Now we can clean the data set by removing genes that don't have detectable expression.
# WGCNA QC: flag genes/samples with too many missing values; all pass here
gsg = goodSamplesGenes(basal_expr, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
if (!gsg$allOK)
{
basal_expr = basal_expr[gsg$goodSamples, gsg$goodGenes]
}
if (gsg$allOK)
{
basal_expr = basal_expr
}
dim(basal_expr)
## [1] 31 37891
#ID people with both timepoints
# Subset counts to the 44 paired samples (22 subjects x 2 timepoints)
cts_paired <- cts[,names(cts) %in% pheno.long.paired$SeqID]
cts_paired <- cts_paired %>%
dplyr::select(pheno.long.paired$SeqID) #put in right order
cts_paired.mat <- as.matrix(cts_paired)
rownames(pheno.long.paired) <- pheno.long.paired$SeqID
all(names(cts_paired.mat) == rownames(pheno.long.paired))
## [1] TRUE
dim(cts_paired.mat)
## [1] 58721 44
batch.paired <- pheno.long.paired$Batch
covar.mod.paired <- model.matrix(~as.factor(Sex), data = pheno.long.paired) #establishes reference
# Adjustment commented out (slow); paired_combat_adj is assumed to be
# available from a cached run — TODO confirm where it is loaded
#paired_combat_adj <- ComBat_seq(cts_paired.mat, batch = batch.paired, group = NULL, covar_mod = covar.mod.paired)
dim(paired_combat_adj)
## [1] 58721 44
paired_cts.adj <- as.data.frame(paired_combat_adj)
# Total counts per gene. rowSums() is vectorized over rows, so the original
# per-row loop is unnecessary; this produces the identical vector.
genesums <- rowSums(paired_cts.adj)
# Keep genes with nonzero total counts
above0 <- genesums > 0
sum(above0)
## [1] 39730
filt_paired <- paired_cts.adj[above0,]
# Library-size spread after filtering
max(colSums(filt_paired))/min(colSums(filt_paired))
## [1] 2.818036
This reduces the number of genes from 58721 to 39730, but there is still a 2.8-fold difference in library size.
Out of curiosity, how many are the same in the lists?
# Count genes retained in both the paired and the basal filtered sets
sum(rownames(filt_paired) %in% rownames(filt_basal))
## [1] 37363
filt_paired <- as.matrix(filt_paired)
# Variance-stabilizing transformation of the paired-sample counts
paired_set <- varianceStabilizingTransformation(round(filt_paired), blind=TRUE,
fitType = "parametric")
## converting counts to integer mode
max(colSums(paired_set))/min(colSums(paired_set))
## [1] 1.008571
dim(paired_set)
## [1] 39730 44
# no-op since R 4.0, kept for compatibility with older WGCNA tutorials
options(stringsAsFactors = FALSE)
dim(paired_set)
## [1] 39730 44
# Transpose to WGCNA orientation (samples in rows, genes in columns)
paired_expr = as.data.frame(t(paired_set))
colnames(paired_expr) = rownames(paired_set)
rownames(paired_expr) = colnames(paired_set)
dim(paired_expr)
## [1] 44 39730
Now we can clean the data set by removing genes that don't have detectable expression.
# WGCNA QC: flag genes/samples with too many missing values; all pass here
gsg = goodSamplesGenes(paired_expr, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
if (!gsg$allOK)
{
paired_expr = paired_expr[gsg$goodSamples, gsg$goodGenes]
}
if (gsg$allOK)
{
paired_expr = paired_expr
}
dim(paired_expr)
## [1] 44 39730
# NOTE(review): this compares paired_expr's rownames to themselves, so it
# is always TRUE — probably intended to compare against colnames(paired_set)
# as in the full-sample check above; confirm and fix
all.equal(rownames(paired_expr),rownames(paired_expr))
## [1] TRUE
Soft-thresholding power was determined to be 12 for both paired and baseline networks based on code below.
# Candidate soft-thresholding powers: 1-10, then even values 12-20
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(basal_expr, powerVector = powers, verbose = 5)
#Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=0.9,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=0.9,col="red")
We will apply biweight mid correlation to reduce the influence of outliers.
# Build the baseline WGCNA network: signed network, biweight midcorrelation
# (robust to outliers), power 12 chosen from the soft-threshold scan above,
# single block (maxBlockSize exceeds the gene count)
basal_network <- blockwiseModules(basal_expr, power = 12, maxBlockSize = 40000,
deepSplit = 4, pamStage = TRUE,
corType = "bicor",
networkType = "signed",
minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE,
saveTOMs = FALSE,
saveTOMFileBase = "TOMS",
verbose = 3)
# Repeat the soft-threshold scan for the paired-sample expression set
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(paired_expr, powerVector = powers, verbose = 5)
#Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=0.9,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=0.9,col="red")
# Build the paired-sample WGCNA network with the same settings as the
# baseline network (signed, bicor, power 12, single block)
paired_network <- blockwiseModules(paired_expr, power = 12, maxBlockSize = 40300,
deepSplit = 4, pamStage = TRUE,
corType = "bicor",
networkType = "signed",
minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE,
saveTOMs = FALSE,
saveTOMFileBase = "TOMS",
verbose = 3)
Unlike WGCNA, PLIER is run across all samples. See Figure 1 and/or text for details.
# no-op since R 4.0, kept for compatibility with older code
options(stringsAsFactors=FALSE)
# PLIER runs on all 53 batch-adjusted samples
expression_data <- combat_adj #use batch adjusted data
dim(expression_data)
## [1] 58721 53
head(colnames(expression_data)) #make sure that these are the sample names
## [1] "P_008_1" "P_012_1" "P_015_1" "P_048_1" "P_080_1" "P_081_1"
Provide phenotype data.
subject_data <- pheno.long
subject_data <- subject_data %>%
mutate(Timepoint = as.factor(Timepoint),
Batch = as.factor(Batch),
Sex = as.factor(Sex))
#check to make sure that it is compatible with the gene expression data.
all(subject_data$SeqID %in% colnames(expression_data))
## [1] TRUE
# edgeR filtering/normalization: keep genes with CPM > 0.5 in >= 2 samples,
# then TMM-normalize and take log-CPM
dge <- DGEList(counts = expression_data)
# filtering and normalization}
keep <- rowSums(cpm(dge) > 0.5) >= 2
dge <- dge[keep, , keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)
# obtain z-score}
z <- cpm(dge, log=TRUE)
dim(z)
## [1] 17717 53
z.means <- rowMeans(z)
rownames(gene_df) <- gene_df$link
gene <- gene_df[rownames(z),]
#map from ensembl ID to gene name
#this function applies a function to each item in 1:nrow(z) to find the location of the maximum value
# For each gene symbol, keep the Ensembl ID with the highest mean expression
# (collapses multi-mapping Ensembl IDs to one row per gene symbol)
ensembl_to_gene <- tapply(1:nrow(z),gene$gene_name,function(i){
i[which.max(z.means[i])]})
y <- z[ensembl_to_gene,]
rownames(y) <- names(ensembl_to_gene)
#load in canonical pathways for PLIER to read from
data(bloodCellMarkersIRISDMAP)
data(canonicalPathways)
# NOTE(review): these two paths use "../0_data/" while every other path in
# this file uses "../../0_data/" — confirm the working directory here
load("../0_data/MetaMEx_matrix.Rds")
load("../0_data/FiberType_matrix.Rds")
ncol(combinePaths(bloodCellMarkersIRISDMAP, canonicalPathways))
## [1] 606
ncol(MetaMEx_matrix)
## [1] 6
ncol(FiberType_matrix)
## [1] 2
#pool them
# Combine all prior-knowledge gene sets into one genes x pathways matrix
allPaths <- combinePaths(bloodCellMarkersIRISDMAP, canonicalPathways, MetaMEx_matrix, FiberType_matrix)
#let's look at allPaths to understand it
dim(allPaths)
## [1] 8861 614
tail(colnames(allPaths)) #these are canonical pathways
## [1] "MetaMEx_AerobicTraining" "MetaMEx_ResistanceTraining"
## [3] "MetaMEx_HIITTraining" "MetaMEx_CombinedTraining"
## [5] "Type_I_markers" "Type_IIa_markers"
head(rownames(allPaths)) #these are genes
## [1] "GAS6" "MMP14" "MARCKSL1" "SPARC" "CTSD" "EPAS1"
#find genes in the pathways that are in our filtered data set
cm.genes <- commonRows(allPaths,y)
length(cm.genes) #7817
## [1] 7817
#normalize expression
yN <- rowNorm(y)
#compute principal components
# num.pc estimates the number of significant PCs; used to choose PLIER's k
num.pc(yN) #18
## Warning in if ((class(data) != "list") & (class(data) != "rsvd")) {: the
## condition has length > 1 and only the first element will be used
## Computing svd
## [1] 18
We set k to 25 based on guidance from the developers of PLIER to apply ~1.5x num.pc.
PLIER will generate key matrices that are coded by letters: Z: genes x LVs ("the loadings") B: LVs x samples ("the scores")
# Precompute the SVD and the Chat matrix so PLIER does not redo them;
# the PLIER call itself is commented out (slow) and the cached result loaded
yN.svd <- svd(yN[cm.genes,])
Chat <- computeChat(allPaths[cm.genes,])
#MASplierResult <- PLIER(yN[cm.genes,], allPaths[cm.genes,],k=25, svdres=yN.svd, Chat=Chat)
load("../../2_pipeline/2_MASTERS_PLIER/MASplierResult.Rdata")
As described in Methods, we reran PLIER without Subjects P_36 and P_012. This was because we found that LVs driven by these two subjects were consistently not robust to sensitivity checks, and the impact of this extended into our WGCNA/PLIER cross-analysis. Thus, PLIER is performed again below without the four samples (both pre and post) for these subjects, i.e., n = 49. The resulting LVs were used to annotate the lists.
# Sensitivity rerun: drop all samples (both timepoints) for subjects P_36
# and P_012, leaving n = 49
expression_data <- as.data.frame(combat_adj) %>%
dplyr::select(-contains("P_36_"), -contains("P_012_"))
dim(expression_data)
## [1] 58721 49
head(colnames(expression_data))
## [1] "P_008_1" "P_015_1" "P_048_1" "P_080_1" "P_081_1" "P_086_1"
subject_data <- pheno.long %>%
filter(SeqID %in% colnames(expression_data))
subject_data <- subject_data %>%
mutate(Timepoint = as.factor(Timepoint),
Batch = as.factor(Batch),
Sex = as.factor(Sex))
dim(subject_data)
## [1] 49 48
Check to make sure that it is compatible with the gene expression data.
#check using rownames
all(rownames(subject_data) %in% colnames(expression_data))
## [1] TRUE
dge <- DGEList(counts = expression_data)
keep <- rowSums(cpm(dge) > 0.5) >= 2
dge <- dge[keep, , keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)
z <- cpm(dge, log=TRUE)
z.means <- rowMeans(z)
rownames(gene_df) <- gene_df$link
gene <- gene_df[rownames(z),]
#map from ensembl ID to gene name
#this function applies a function to each item in 1:nrow(z) to find the location of the maximum value
ensembl_to_gene <- tapply(1:nrow(z),gene$gene_name,function(i){
i[which.max(z.means[i])]})
y <- z[ensembl_to_gene,]
rownames(y) <- names(ensembl_to_gene)
#find genes in the pathways that are in our filtered data set
cm.genes <- commonRows(allPaths,y)
length(cm.genes)
## [1] 7682
#normalize expression
yN <- rowNorm(y)
#compute principal components
num.pc(yN) #19
## Warning in if ((class(data) != "list") & (class(data) != "rsvd")) {: the
## condition has length > 1 and only the first element will be used
## Computing svd
## [1] 19
yN.svd <- svd(yN[cm.genes,])
Chat <- computeChat(allPaths[cm.genes,])
#MASplier_updated <- PLIER(yN[cm.genes,], allPaths[cm.genes,],k=25, svdres=yN.svd, Chat=Chat)
load("../../2_pipeline/2_MASTERS_PLIER/MASplier_updated.Rdata")
plotU(MASplier_updated, auc.cutoff = 0.50, fdr.cutoff = 0.05, top = 5)
These are useful to run in order to understand how the network was constructed. Additionally, items created in this chunk are used in later analysis.
# Load the precomputed blockwise WGCNA network for baseline (basal) samples
load(file="../../2_pipeline/1_MASTERS_WGCNA/store/TEMP_basal_network.Rdata")
# Numeric module labels; 0 = unassigned ("grey") genes
table(basal_network$colors)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 26412 4099 2329 2283 640 487 307 253 240 226 158 146 132
## 13 14
## 103 76
table(labels2colors(basal_network$colors))
##
## black blue brown cyan green greenyellow
## 253 2329 2283 76 487 146
## grey magenta pink purple red salmon
## 26412 226 240 158 307 103
## tan turquoise yellow
## 132 4099 640
mergedColors = labels2colors(basal_network$colors)
# One dendrogram per block; plot each with its module color assignments
for (i in 1:length(basal_network$dendrograms))
{
plotDendroAndColors(basal_network$dendrograms[[i]], mergedColors[basal_network$blockGenes[[i]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
}
modLabels_basal = basal_network$colors
modColors_basal = labels2colors(basal_network$colors)
# Module eigengenes (first PC of each module's expression), ordered by similarity
MEs_basal = moduleEigengenes(basal_expr, modColors_basal)$eigengenes
MEs_basal = orderMEs(MEs_basal)
geneTree_basal = basal_network$dendrograms[[1]]
nGenes = ncol(basal_expr);
nSamples = nrow(basal_expr);
length(MEs_basal)
## [1] 15
Prepare correlation matrix to look at primary outcome (CT) and secondary (DXA) for comparison.
# Sanity check: phenotype rows must align 1:1 with expression samples
nrow(resid.pheno.wide)==nrow(basal_expr) #good
## [1] TRUE
# Pearson correlation of each module eigengene with CT (primary) and DXA (secondary)
# outcomes; use="p" = pairwise-complete observations
moduleTraitCor_basal = cor(MEs_basal,
resid.pheno.wide[,c("adj_d_CT_area","adj_delta_bilateral_thigh_normalized")], use="p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor_basal, nSamples = nrow(basal_expr));
#check for column names to make sure I am selecting the correct traits of interest
sizeGrWindow(8,8)
# Will display correlations and their p-val
textMatrix = signif(moduleTraitPvalue, 1);
# Cell labels: "r (p)" with 1 significant digit each
textMatrix = paste(signif(moduleTraitCor_basal, 1), " (", signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor_basal)
par(mar = c(12, 8, 3, 2));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor_basal,
xLabels = c("change in mid-thigh muscle area (CT)","change in bilateral thigh mass (DEXA)"),
yLabels = names(MEs_basal),
ySymbols = names(MEs_basal),
colorLabels = FALSE,
colors = blueWhiteRed(100),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
cex.lab.x = 0.5,
cex.lab.y = 0.5,
xLabelsAdj = 1,
zlim = c(-1,1))
#main = "Change in Muscle Phenotype vs. Basal ME")
#verticalSeparator.x = c(4,7), verticalSeparator.col = "white", verticalSeparator.lwd = 10))
Let's also examine connections to myofiber CSA, an exploratory hypertrophy comparison. This is Figure S4A:
# Same heatmap as above, adding the exploratory myofiber CSA outcome (Figure S4A)
nrow(resid.pheno.wide)==nrow(basal_expr) #good
## [1] TRUE
moduleTraitCor_basal = cor(MEs_basal,
resid.pheno.wide[,c("adj_d_CT_area","adj_delta_bilateral_thigh_normalized",
"adj_d_myofiber_CSA")], use="p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor_basal, nSamples = nrow(basal_expr));
#check for column names to make sure I am selecting the correct traits of interest
sizeGrWindow(8,8)
textMatrix = signif(moduleTraitPvalue, 1);
# Cell labels: "r (p)" with 1 significant digit each
textMatrix = paste(signif(moduleTraitCor_basal, 1), " (", signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor_basal)
par(mar = c(12, 8, 3, 2));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor_basal,
xLabels = c("change in mid-thigh muscle area (CT)","change in bilateral thigh mass (DXA)", "change in myofiber CSA"),
yLabels = names(MEs_basal),
ySymbols = names(MEs_basal),
colorLabels = FALSE,
colors = blueWhiteRed(100),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
cex.lab.x = 0.5,
cex.lab.y = 0.5,
xLabelsAdj = 1,
zlim = c(-1,1))
#main = "Change in Muscle Phenotype vs. Basal ME")
#verticalSeparator.x = c(4,7), verticalSeparator.col = "white", verticalSeparator.lwd = 10))
Apply p filtering cutoff to establish connections of interest
#calculate new p threshold using Bonferroni adjustment for n modules
# Adjust for the number of real modules only (grey holds unassigned genes)
p.threshold <- 0.05/(length(MEs_basal)-1) #do not penalize for grey module (leftovers)
#modules with correlation p values below adjusted threshold
IDmods <- as.data.frame(moduleTraitPvalue) %>%
filter((adj_delta_bilateral_thigh_normalized <= p.threshold) |
(adj_d_CT_area <= p.threshold) | (adj_d_myofiber_CSA <= p.threshold))
#modules with correlation p values below unadjusted p <0.05
nominal.mods <- as.data.frame(moduleTraitPvalue) %>%
filter((adj_delta_bilateral_thigh_normalized <= 0.05) |
(adj_d_CT_area <= 0.05) | (adj_d_myofiber_CSA <= 0.05))
#bring them together
# Rownames look like "MEred"; str_sub(..., 3) strips the leading "ME"
mods_of_interest <- str_sub(unique(c(rownames(IDmods), rownames(nominal.mods))),3) #drop the ME
nominal.mods
Table S1 shows sizes (n genes) and mean module membership for Prediction WGCNA modules.
# Table S1: per-module gene counts and mean module membership (custom helper)
basal_mod_table <- ModuleTable(basal_expr, basal_network, modColors_basal)
basal_mod_table <- basal_mod_table %>%
rownames_to_column(var="module")
#write.csv(basal_mod_table,"../5_manuscript/Supplements/Tables/prediction_WGCNA_module_table.csv")
#createTablefor(module = "red", ExprSet = basal_expr, modColors = modColors_basal, trait = resid.pheno.wide$adj_d_CT_area)
basal_red <- read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/redGeneTable.csv")
mods_of_interest <- as.data.frame(mods_of_interest)
names(mods_of_interest) = "module"
# NOTE(review): traits are assigned by position and must match the row order of
# mods_of_interest exactly — TODO confirm order whenever the module set changes
mods_of_interest$trait <- c("adj_d_CT_area", "adj_d_myofiber_CSA","adj_d_CT_area")
traits_of_interest <- resid.pheno.wide
mods_of_interest
These plots are presented in Figures 5A, 6A, and S4B.
# Figures 5A, 6A, S4B: scatterplots of module average expression vs traits
# averageExpr ("AE<color>" columns) is the mean of the standardized module genes
basal_avg_expr <- moduleEigengenes(basal_expr, modColors_basal)
avg_expr <- basal_avg_expr$averageExpr
all(str_sub(names(avg_expr),3) %in% modColors_basal)
## [1] TRUE
basal_avg_expr <- avg_expr %>%
rownames_to_column(var="SeqID_w0")
# Join phenotypes and module average expression by baseline sequencing ID
plotWGCNA <- full_join(resid.pheno.wide, basal_avg_expr, by = "SeqID_w0")
mods_of_interest_AE <- mods_of_interest %>%
mutate(module = str_c("AE", module))
# One scatterplot per module/trait pair; point color = module color (strip "AE")
for (i in 1:nrow(mods_of_interest)){
print(ggplot(plotWGCNA) + geom_point(mapping=aes(x=plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$module[i]],
y=plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$trait[i]]),
color=str_sub(mods_of_interest_AE$module[i],3), size = 5) +
geom_smooth(mapping=aes(x=plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$module[i]],
y=plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$trait[i]]), method = "lm", colour = "black")+
scale_y_continuous(name = mods_of_interest_AE$trait[i]) +
scale_x_continuous(name = mods_of_interest_AE$module[i]) +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt")))
#ggsave(filename = paste0("../../3_output/1_MASTERS_WGCNA/Scatterplots/basal/",str_sub(mods_of_interest_AE$module[i],3),
#"_",mods_of_interest_AE$trait[i],".pdf"), width = 6, height = 3.5)
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
The values presented in figure legends are generated below.
# Regress module average expression (row 1 of mods_of_interest_AE) on its trait
red_CT <- lm(data=plotWGCNA, plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$module[1]] ~
plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$trait[1]])
summary(red_CT)
##
## Call:
## lm(formula = plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$module[1]] ~
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[1]],
## data = plotWGCNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76704 -0.26660 0.03821 0.25315 0.94908
##
## Coefficients:
## Estimate
## (Intercept) 1.557e-16
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[1]] 8.129e-02
## Std. Error
## (Intercept) 6.966e-02
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[1]] 1.357e-02
## t value Pr(>|t|)
## (Intercept) 0.000 1
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[1]] 5.988 1.64e-06
##
## (Intercept)
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[1]] ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3878 on 29 degrees of freedom
## Multiple R-squared: 0.5529, Adjusted R-squared: 0.5375
## F-statistic: 35.86 on 1 and 29 DF, p-value: 1.645e-06
# NOTE(review): the variable name says magenta~CT, but row 2 of mods_of_interest
# pairs with trait "adj_d_myofiber_CSA", and this output matches the AEsalmon ~
# adj_d_myofiber_CSA fit below — the name appears mislabeled; verify against
# mods_of_interest row order
magenta_CT <- lm(data=plotWGCNA, plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$module[2]] ~
plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$trait[2]])
summary(magenta_CT)
##
## Call:
## lm(formula = plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$module[2]] ~
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[2]],
## data = plotWGCNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6458 -0.3136 -0.1569 0.2479 1.3429
##
## Coefficients:
## Estimate
## (Intercept) -0.1151165
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[2]] -0.0003297
## Std. Error
## (Intercept) 0.1112658
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[2]] 0.0001124
## t value Pr(>|t|)
## (Intercept) -1.035 0.31320
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[2]] -2.934 0.00821
##
## (Intercept)
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[2]] **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5219 on 20 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.3009, Adjusted R-squared: 0.2659
## F-statistic: 8.607 on 1 and 20 DF, p-value: 0.008209
# NOTE(review): the variable name says salmon~fCSA, but row 3 of mods_of_interest
# pairs with trait "adj_d_CT_area" — the name appears mislabeled; verify against
# mods_of_interest row order
salmon_fCSA <- lm(data=plotWGCNA, plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$module[3]] ~
plotWGCNA[,names(plotWGCNA)==mods_of_interest_AE$trait[3]])
summary(salmon_fCSA)
##
## Call:
## lm(formula = plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$module[3]] ~
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[3]],
## data = plotWGCNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8136 -0.3157 -0.1167 0.1584 1.6475
##
## Coefficients:
## Estimate
## (Intercept) 8.779e-17
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[3]] 4.331e-02
## Std. Error
## (Intercept) 9.150e-02
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[3]] 1.783e-02
## t value Pr(>|t|)
## (Intercept) 0.000 1.0000
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[3]] 2.429 0.0216
##
## (Intercept)
## plotWGCNA[, names(plotWGCNA) == mods_of_interest_AE$trait[3]] *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5095 on 29 degrees of freedom
## Multiple R-squared: 0.1691, Adjusted R-squared: 0.1404
## F-statistic: 5.9 on 1 and 29 DF, p-value: 0.02157
Let's drop the extreme points (beyond ±3 standard deviations) for these modules and confirm that the associations are upheld.
# Sensitivity check: refit after excluding samples beyond +/-3 SD of AEred
prediction_sens_check <- plotWGCNA %>%
filter(AEred<= mean(plotWGCNA$AEred)+3*sd(plotWGCNA$AEred) &
AEred>= mean(plotWGCNA$AEred)-3*sd(plotWGCNA$AEred))
AEred.fit <- lm(data = prediction_sens_check, AEred ~adj_d_CT_area)
summary(AEred.fit)
##
## Call:
## lm(formula = AEred ~ adj_d_CT_area, data = prediction_sens_check)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73142 -0.17248 -0.01445 0.26970 0.55383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.04230 0.06125 -0.690 0.495573
## adj_d_CT_area 0.05862 0.01350 4.341 0.000167 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3337 on 28 degrees of freedom
## Multiple R-squared: 0.4023, Adjusted R-squared: 0.3809
## F-statistic: 18.85 on 1 and 28 DF, p-value: 0.0001673
# Sensitivity check: refit after excluding samples beyond +/-3 SD of AEmagenta
prediction_sens_check <- plotWGCNA %>%
filter(AEmagenta<= mean(plotWGCNA$AEmagenta)+3*sd(plotWGCNA$AEmagenta) &
AEmagenta>= mean(plotWGCNA$AEmagenta)-3*sd(plotWGCNA$AEmagenta))
AEmagenta.fit <- lm(data = prediction_sens_check, AEmagenta ~adj_d_CT_area)
summary(AEmagenta.fit)
##
## Call:
## lm(formula = AEmagenta ~ adj_d_CT_area, data = prediction_sens_check)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74251 -0.25738 -0.07633 0.18304 0.96782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05501 0.07494 -0.734 0.46906
## adj_d_CT_area 0.04093 0.01438 2.847 0.00818 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4105 on 28 degrees of freedom
## Multiple R-squared: 0.2245, Adjusted R-squared: 0.1968
## F-statistic: 8.104 on 1 and 28 DF, p-value: 0.008178
# Sensitivity check: refit after excluding samples beyond +/-3 SD of AEsalmon
prediction_sens_check <- plotWGCNA %>%
filter(AEsalmon<= mean(plotWGCNA$AEsalmon)+3*sd(plotWGCNA$AEsalmon) &
AEsalmon>= mean(plotWGCNA$AEsalmon)-3*sd(plotWGCNA$AEsalmon))
AEsalmon.fit <- lm(data = prediction_sens_check, AEsalmon ~adj_d_myofiber_CSA)
summary(AEsalmon.fit)
##
## Call:
## lm(formula = AEsalmon ~ adj_d_myofiber_CSA, data = prediction_sens_check)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6458 -0.3136 -0.1569 0.2479 1.3429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1151165 0.1112658 -1.035 0.31320
## adj_d_myofiber_CSA -0.0003297 0.0001124 -2.934 0.00821 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5219 on 20 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.3009, Adjusted R-squared: 0.2659
## F-statistic: 8.607 on 1 and 20 DF, p-value: 0.008209
We see that it's upheld without the extremely high point. Based on the figures for magenta and salmon, I don't really see an extreme data point that we might suppose is driving the module-trait effect.
# Generate a gene table and a module-trait scatterplot for each module of interest.
# NOTE(review): the original createTablefor() call ended in a trailing comma followed
# by commented-out arguments, leaving the call unclosed (parse error); the call is
# closed here and the optional arguments are preserved as comments below.
for (i in seq_len(nrow(mods_of_interest))) {
  createTablefor(module = mods_of_interest$module[i], ExprSet = basal_expr, modColors = modColors_basal)
  # trait = traits_of_interest[,names(traits_of_interest)==mods_of_interest$trait[i]],
  # filepath = "/data/project/bamman-lab/MASTERS/00_preliminary/prelim_pipeline_development/WGCNA/"
  createPlotfor(module = mods_of_interest$module[i], ExprSet = basal_expr, Network = basal_network,
                modColors = modColors_basal,
                trait = traits_of_interest[,names(traits_of_interest)==mods_of_interest$trait[i]],
                filepath = "/data/project/bamman-lab/MASTERS/3_output/1_MASTERS_WGCNA/Scatterplots/basal/")
}
Topological Overlap Matrix run as job in UAB Cheaha (09.24.2020).
# Topological Overlap Matrix was computed offline on the cluster; load the cached copy.
#basal_TOM = TOMsimilarityFromExpr(basal_expr, power = 12)
load(file="../../2_pipeline/1_MASTERS_WGCNA/store/basal_TOM.Rdata")
# NOTE(review): the original wrapped this in str_sub() with no start/end arguments,
# which returns the input unchanged; the no-op call is removed.
modules_to_pursue <- mods_of_interest$module
# Read each module's precomputed gene-detail table into "<color>_module_details"
for (i in seq_along(modules_to_pursue)) {
  filepath <- str_c("../../3_output/1_MASTERS_WGCNA/GeneDetails/",
                    modules_to_pursue[i], "_module_details.csv", sep = "")
  x <- paste(modules_to_pursue[i], "_module_details", sep = "")
  assign(x, read.csv(file = filepath, header = TRUE, row.names = 1))
}
# Export edge/node files for Cytoscape; "fraction" selects which genes are kept
getCytoscape(module = "red", detailTable = red_module_details, modColors = modColors_basal,
             ExprSet = basal_expr, fraction = "quadrant_4", TOM_matrix = basal_TOM)
getCytoscape(module = "salmon", detailTable = salmon_module_details, modColors = modColors_basal,
             ExprSet = basal_expr, fraction = "top_half", TOM_matrix = basal_TOM)
getCytoscape(module = "magenta", detailTable = magenta_module_details, modColors = modColors_basal,
             ExprSet = basal_expr, fraction = "quadrant_4", TOM_matrix = basal_TOM)
The cytoscape output was used to generate figures 5C, 6C, and S4D.
# Build a simple annotation frame: TimePoint recoded so unpaired baseline = "1",
# paired baseline = "2", post-training = "3"
simpledf <- subject_data %>%
mutate(Paired = ifelse(SeqID %in% pheno.long.paired$SeqID,1,0),
TimePoint = as.factor(ifelse(Paired==0&Timepoint==1,"1",ifelse(Paired==1&Timepoint==1,"2","3")))) %>%
dplyr::select("SeqID","TimePoint","Sex", "Batch") %>%
arrange(TimePoint, Sex, Batch)
# Reorder expression columns to match simpledf row order (confirmed TRUE below)
newyN <- yN[,match(rownames(simpledf), colnames(yN))]
all.equal(rownames(simpledf),colnames(newyN))
## [1] TRUE
#load("../../2_pipeline/2_MASTERS_PLIER/MASplier_updated.Rdata")
subject_data <- pheno.long %>%
filter(SeqID %in% colnames(MASplier_updated$B))
#adjusted normalized values
adj.vars.of.interest <- c("adj_bilateral_thigh_mass_normalized", "adj_avg_area", "adj_Avg_CSA")
#change <- resid.pheno.wide %>%
#filter(SeqID %in% subject_data$SeqID) %>%
#dplyr::select(SeqID, any_of(adj.vars.of.interest))
#adjusted deltas
predict <- resid.pheno.wide %>%
filter(SeqID_w0 %in% subject_data$SeqID) %>%
dplyr::select(SeqID_w0, adj_delta_bilateral_thigh_normalized, adj_d_CT_area, adj_d_myofiber_CSA) %>%
rename("SeqID" = "SeqID_w0")
# Samples x LVs score matrix (t(B)) joined with sex, timepoint, and delta outcomes;
# ExerciseEffect is NA for samples without a pre/post pair
pathdata_updated <- as.data.frame(t(MASplier_updated$B)) %>%
rownames_to_column(var = "SeqID") %>%
mutate(Sex = subject_data$Sex[match(SeqID, subject_data$SeqID)],
TimePoint = subject_data$Timepoint[match(SeqID, subject_data$SeqID)],
ExerciseEffect = ifelse(!SeqID %in% pheno.long.paired$SeqID,NA,TimePoint)) %>%
#left_join(change, by ="SeqID") %>%
left_join(predict, by = "SeqID") %>%
column_to_rownames(var = "SeqID")
# Rename the first nrow(B) columns to LV1..LVk (assumes LV columns come first)
for (i in 1:nrow(MASplier_updated$B)){
colnames(pathdata_updated)[i] <- paste(c("LV",i),collapse = "")
}
pathdata_updated
The information here is used to create Table 2.
# Keep only subjects with a CT outcome (n = 29) for the prediction analysis
pathdata_updated_predict <- pathdata_updated %>%
filter(!is.na(adj_d_CT_area)) #29
# Empty LV x trait p-value table; V1 is the placeholder matrix column, dropped
signifTable_predict_updated <- as.data.frame(matrix(nrow = nrow(MASplier_updated$B))) %>%
mutate(LV = rownames(MASplier_updated$B),adj_delta_bilateral_thigh_normalized = NA,
adj_d_CT_area = NA, adj_d_myofiber_CSA = NA) %>%
dplyr::select(-V1)
# For each of the k = 25 LVs, record the slope p-value of LV-score ~ trait
# ([2,4] = second coefficient row, Pr(>|t|) column); 25 is hard-coded to k
for(i in 1:25){
j = pathdata_updated_predict[,i]
fit1 <- lm(data = pathdata_updated_predict, formula = j ~ adj_delta_bilateral_thigh_normalized)
signifTable_predict_updated$adj_delta_bilateral_thigh_normalized[i]=round(summary(fit1)[["coefficients"]][2,4], digits=4)
fit2 <- lm(data = pathdata_updated_predict, formula = j ~ adj_d_CT_area)
signifTable_predict_updated$adj_d_CT_area[i]=round(summary(fit2)[["coefficients"]][2,4], digits=4)
fit3 <- lm(data = pathdata_updated_predict, formula = j ~ adj_d_myofiber_CSA)
signifTable_predict_updated$adj_d_myofiber_CSA[i]=round(summary(fit3)[["coefficients"]][2,4], digits=4)
}
# LVs nominally associated (p <= 0.05) with any hypertrophy outcome
LVs_of_interest_prediction_updated <- signifTable_predict_updated %>%
filter(adj_delta_bilateral_thigh_normalized <= 0.05 |
adj_d_CT_area <= 0.05 |
adj_d_myofiber_CSA <= 0.05) %>%
mutate(LVid = str_extract(LV, pattern="[:digit:]{1,}"))
#present it in a sparse fashion to make it easier to look at table
LVs_of_interest_prediction_updated %>%
mutate(adj_delta_bilateral_thigh_normalized = ifelse(adj_delta_bilateral_thigh_normalized<0.05,
round(adj_delta_bilateral_thigh_normalized,3), NA),
adj_d_CT_area = ifelse(adj_d_CT_area<0.05, round(adj_d_CT_area,3), NA),
adj_d_myofiber_CSA = ifelse(adj_d_myofiber_CSA<0.05, round(adj_d_myofiber_CSA,3), NA)) %>%
relocate(LVid)
These are all candidates for follow up. Those that pass the sensitivity checks will be presented.
# LV7 vs change in CT area: scatterplot with linear fit
ggplot(pathdata_updated_predict) +
geom_point(mapping=aes(x=LV7,y=adj_d_CT_area),colour="black", size = 3) +
geom_smooth(mapping=aes(x=LV7,y=adj_d_CT_area), method = "lm", colour = "black") +
scale_y_continuous(name = "adj_d_CT_area") +
scale_x_continuous(name = "LV7") +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt"))
## `geom_smooth()` using formula 'y ~ x'
# Corresponding linear model for the figure legend statistics
lv7fit <- lm(data=pathdata_updated_predict, LV7 ~ adj_d_CT_area)
summary(lv7fit)
##
## Call:
## lm(formula = LV7 ~ adj_d_CT_area, data = pathdata_updated_predict)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7812 -0.6716 0.1482 0.8477 1.2826
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03872 0.16057 0.241 0.8113
## adj_d_CT_area -0.08400 0.03576 -2.349 0.0264 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8546 on 27 degrees of freedom
## Multiple R-squared: 0.1697, Adjusted R-squared: 0.139
## F-statistic: 5.519 on 1 and 27 DF, p-value: 0.02638
# Trait columns passed to the continuous-heatmap function below
vars.of.interest <- c("adj_delta_bilateral_thigh_normalized","adj_d_CT_area","adj_d_myofiber_CSA")
continuousdf <- resid.pheno.wide %>%
dplyr::select("SubjectID","SeqID_w0","Batch", any_of(vars.of.interest)) %>%
filter(SeqID_w0 %in% colnames(as.data.frame(MASplier_updated$B))) %>% #make sure we use same samples
# Manually correct the batch assignment for the two UKMET subjects
mutate(Batch = ifelse(SubjectID=="UKMET08",3,Batch),
Batch = ifelse(SubjectID=="UKMET05",3,Batch)) %>%
column_to_rownames(var = "SeqID_w0")
We ran continuous heatmaps for all the PLIER LVs in order to visualize the relationship with the change in CT, DXA, and fCSA. Below is the code, which draws on the customplotTopZcontinuous function.
# One PDF per LV, top 50 loading genes, annotated by the DXA outcome
for (i in 1:nrow(MASplier_updated$B)){
filedir <- paste0("../../3_output/2_MASTERS_PLIER/SensitivityCheck/Heatmaps/DXA_basal/continuous_heatmaps_DXA",i,".pdf")
customplotTopZcontinuous(plierRes = MASplier_updated, variable = vars.of.interest[1],
priorMat = allPaths, top = 50, index = i, regress = FALSE, allLVs = TRUE,
width = 15, height = 10, filename=filedir)
}
# Same, annotated by the CT outcome
for (i in 1:nrow(MASplier_updated$B)){
filedir <- paste0("../../3_output/2_MASTERS_PLIER/SensitivityCheck/Heatmaps/CT_basal/continuous_heatmaps_CT",i,".pdf")
customplotTopZcontinuous(plierRes = MASplier_updated, variable = vars.of.interest[2],
priorMat = allPaths, top = 50, index = i, regress = FALSE, allLVs = TRUE,
width = 15, height = 10, filename=filedir)
}
# Same, annotated by the myofiber CSA outcome
for (i in 1:nrow(MASplier_updated$B)){
filedir <- paste0("../../3_output/2_MASTERS_PLIER/SensitivityCheck/Heatmaps/fCSA_basal/continuous_heatmaps_fCSA",i,".pdf")
customplotTopZcontinuous(plierRes = MASplier_updated, variable = vars.of.interest[3],
priorMat = allPaths, top = 50, index = i, regress = FALSE, allLVs = TRUE,
width = 15, height = 10, filename=filedir)
}
I will show LV7 here, which is Figure 7B.
# Inline rendering of the LV7/CT heatmap (Figure 7B)
customplotTopZcontinuous(plierRes = MASplier_updated, variable = "adj_d_CT_area",
priorMat = allPaths, top = 50, index = 7, regress = FALSE, allLVs = TRUE,
width = 15, height = 10)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(variable)` instead of `variable` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Pair each candidate LV with the trait driving its association (assigned by hand;
# order must match LVs_of_interest_prediction_updated rows)
LVs_prediction <- as.data.frame(LVs_of_interest_prediction_updated$LVid) %>%
rename("LV" = "LVs_of_interest_prediction_updated$LVid") %>%
mutate(LV = as.numeric(LV)) %>%
mutate(trait = c("adj_d_CT_area","adj_d_CT_area","adj_d_myofiber_CSA","adj_d_CT_area",
"adj_delta_bilateral_thigh_normalized","adj_delta_bilateral_thigh_normalized","adj_d_CT_area"))
#put them in manually because I couldn't figure out how to make it ID columns that have a value <0.05 for each row :(
# One scatterplot per LV/trait pair; LV column is selected by numeric position
# (valid because columns 1..k of pathdata_updated_predict are LV1..LVk)
for (i in 1:nrow(LVs_prediction)){
print(ggplot(pathdata_updated_predict) +
geom_point(mapping=aes(x=pathdata_updated_predict[,LVs_prediction$LV[i]],
y=pathdata_updated_predict[,names(pathdata_updated_predict)==LVs_prediction$trait[i]]),
colour="black", size = 3) +
geom_smooth(mapping=aes(x=pathdata_updated_predict[,LVs_prediction$LV[i]],
y=pathdata_updated_predict[,names(pathdata_updated_predict)==LVs_prediction$trait[i]]),
method = "lm", colour = "black") +
scale_y_continuous(name = LVs_prediction$trait[i]) +
scale_x_continuous(name = str_c("LV",LVs_prediction$LV[i])) +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt")))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Sensitivity check: refit LV ~ trait after removing samples beyond +/-3 SD of each
# LV's score; 25 rows = one per LV (hard-coded to k)
outlier_check<- as.data.frame(matrix(ncol = 6, nrow=25))
names(outlier_check) <- c("LV","n","r_val_adj_d_CT_area","p_val_adj_d_CT_area",
"r_val_adj_delta_bilateral_thigh_normalized","p_val_adj_delta_bilateral_thigh_normalized")
for (i in 1:25){
name = str_c("LV",i)
outlier_check$LV[i] = name
clean_path <- pathdata_updated_predict %>% #drops anyone above and below 3 standard deviations
filter(pathdata_updated_predict[,name]<= mean(pathdata_updated_predict[,name])+
3*sd(pathdata_updated_predict[,name]) &
pathdata_updated_predict[,name]>= mean(pathdata_updated_predict[,name])-
3*sd(pathdata_updated_predict[,name]))
outlier_check$n[i] <- nrow(clean_path)
# R^2 and slope p-value for the CT comparison
fit <- lm(data = clean_path, clean_path[,name] ~adj_d_CT_area)
outlier_check$r_val_adj_d_CT_area[i] <- round(summary(fit)$r.squared,3)
outlier_check$p_val_adj_d_CT_area[i] <- round(summary(fit)[["coefficients"]][2,4],3)
# R^2 and slope p-value for the DXA comparison
fit2 <- lm(data = clean_path, clean_path[,name] ~adj_delta_bilateral_thigh_normalized)
outlier_check$r_val_adj_delta_bilateral_thigh_normalized[i] <- round(summary(fit2)$r.squared,3)
outlier_check$p_val_adj_delta_bilateral_thigh_normalized[i] <- round(summary(fit2)[["coefficients"]][2,4],3)
}
#present LVs that survive the check for CT and DXA; write to a variable so we can use it later
LV_sensitivity_check <- outlier_check %>%
filter(LV %in% str_c("LV",LVs_of_interest_prediction_updated$LVid)) %>%
filter(p_val_adj_d_CT_area <.05 | p_val_adj_delta_bilateral_thigh_normalized<0.05)
LV_sensitivity_check
Three are upheld for the CT comparison (LVs 7, 16, 23), and one for DXA (LV 19). These four, in total, are presented in Table 2 as PLIER Prediction LVs.
Export the top 100 genes in these LVs of interest. These are Tables S5, 6, and 8.
# Export top-100 gene tables for prediction LVs that survived the sensitivity
# check AND were associated with the CT outcome (Tables S5, 6, 8)
prediction_LVs <- LVs_of_interest_prediction_updated
CT_LVs <- prediction_LVs %>%
filter(str_c("LV",LVid) %in% LV_sensitivity_check$LV,
adj_d_CT_area <= 0.05)
for (i in 1:nrow(CT_LVs)){
LVtablemaker(MASplier_updated, newyN, allPaths, index = CT_LVs$LVid[i], top = 100,
path = "../../3_output/2_MASTERS_PLIER/SensitivityCheck/LVtables/CT/")
}
We use the mod_LV_check function to find the top association for the top LV that was associated with key module genes. These are appended to the gene information in Tables S2-4.
# For each top red-module gene, find its best-associated LV; collapse LV pathway
# strings (text after the last comma) since some LVs share a pathway label
updated_red_LV <- mod_LV_check(plierResult = MASplier_updated, genetable =
read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/redGeneTable.csv")) %>%
mutate(pathway = gsub(".*,","",LV_path)) #condense because some LVs have the same pathway
# Count genes per pathway, most frequent first
updated_red_LV %>%
group_by(pathway) %>%
summarise(n = length(geneID)) %>%
arrange(-n)
Here we can see, for the top genes in the red module, how many genes map to each LV. This is where it became apparent in the original analysis that the two outlying samples were driving more of the signatures seen in PLIER than they were for WGCNA. Above we see the corrected table, run with the n=49 samples in PLIER.
We did the same thing for the other mods of interest.
# Repeat the LV cross-check for the magenta and salmon modules
magenta_LV_associations <- mod_LV_check(MASplier_updated, genetable =
read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/magentaGeneTable.csv"))
salmon_LV_associations <- mod_LV_check(MASplier_updated, genetable =
read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/salmonGeneTable.csv"))
For this, we will pool one outcome at a time.
# Read every exported CT LV table into a variable named after its LV (e.g. "LV7").
# NOTE(review): the original called list.files() three times per loop iteration;
# the directory listing is hoisted out of the loop (same files, same behavior).
ct_dir <- "../../3_output/2_MASTERS_PLIER/SensitivityCheck/LVtables/CT/"
ct_files <- list.files(ct_dir)
for (i in seq_along(ct_files)) {
  # variable name = text before the first underscore of the file name
  name <- gsub("_.*", "", ct_files[i])
  assign(name, read.csv(file = paste0(ct_dir, ct_files[i])))
}
# Pool the unique top genes across the three CT-associated LVs
all_CT_genes <- unique(c(LV7$X, LV16$X, LV23$X))
#going to combine red and magenta since they both have a CT association
red_gene_table <- read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/redGeneTable.csv")
magenta_gene_table <- read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/magentaGeneTable.csv")
red_magenta <- unique(c(red_gene_table$symbol, magenta_gene_table$symbol))
# Check for overlap between PLIER CT genes and WGCNA CT-module genes
IDoverlaps(all_CT_genes, red_magenta) #nothing
## character(0)
The top genes in LV19 are presented in Table S7.
# LVs surviving the sensitivity check for the DXA outcome (LV19)
DXA_LVs <- LV_sensitivity_check %>%
filter(p_val_adj_delta_bilateral_thigh_normalized<=0.05)
# Table was generated offline; read the cached top-100 gene table for LV19
#LVtablemaker(MASplier_updated, newyN, allPaths, index = 19, top = 100,
#path = "/data/project/bamman-lab/MASTERS/3_output/2_MASTERS_PLIER/SensitivityCheck/LVtables/DXA/")
LV19 <- read.csv("../../3_output/2_MASTERS_PLIER/SensitivityCheck/LVtables/DXA/LV19_table_Top100.csv")
# Three LV19 genes overlap the red/magenta WGCNA modules; show them
overlaps <- IDoverlaps(LV19$X, red_magenta) #3
LV19 %>%
filter(X %in% overlaps)
This code generates the middle column (i.e., Response) in manuscript Table 2.
# Response analysis (Table 2, middle column): paired pre/post t-test on each LV's
# scores, Bonferroni-adjusted across the 25 LVs.
pathdata_updated.RT <- pathdata_updated %>% filter(!is.na(ExerciseEffect)) #40 rows is correct, 20 pairs
# Empty results table; V1 is the placeholder matrix column, dropped
signifTable_updated <- as.data.frame(matrix(nrow = nrow(MASplier_updated$B))) %>%
mutate(LV = rownames(MASplier_updated$B),p_RTeffect = NA, adj_p_RTeffect = NA) %>%
dplyr::select(-V1)
# NOTE(review): the original repeated the pathdata_updated.RT assignment verbatim
# here; the redundant duplicate statement was removed.
for(i in 1:25){
ttest <- t.test(formula=pathdata_updated.RT[,i]~pathdata_updated.RT$TimePoint, paired=TRUE)
signifTable_updated$p_RTeffect[i]<- round(ttest$p.value, digits = 6)
# Bonferroni adjustment (x25); values may exceed 1 but are only compared to 0.05
signifTable_updated$adj_p_RTeffect[i] <- signifTable_updated$p_RTeffect[i]*25
}
# LVs with a significant exercise effect after adjustment
LVs_of_interest_exercise_updated <- signifTable_updated %>%
filter(signifTable_updated$adj_p_RTeffect <= 0.05) %>%
mutate(LVid = str_extract(LV, pattern="[:digit:]{1,}"))
LV_sensitivity_check_response <- LVs_of_interest_exercise_updated %>%
relocate(LVid) %>%
dplyr::select(-p_RTeffect) %>%
mutate(adj_p_RTeffect = round(adj_p_RTeffect,3))
LV_sensitivity_check_response
We can generate box plots for these using the PLIERbox function.
# Collapse the baseline time points into a single "Baseline" category so the
# box plots contrast Baseline vs. Post-RT.
simpledf1 <- simpledf %>%
  rownames_to_column(var = "seqID") %>%
  mutate(PrePost = ifelse(TimePoint == 3, "Post-RT", "Baseline"),
         subjectID = str_sub(seqID, 0, -3))
# One paired box plot per exercise-responsive LV.
for (k in seq_len(nrow(LVs_of_interest_exercise_updated))) {
  print(PlierBox(plierRes = MASplier_updated, index = as.numeric(LVs_of_interest_exercise_updated$LVid[k]),
                 subject.data = simpledf1, group.var = "PrePost", paired = TRUE, subject.var = "subjectID"))
}
# Pull the B scores for the exercise-responsive LVs, samples as rows.
exercise <- as.numeric(LVs_of_interest_exercise_updated$LVid)
examine_exercise <- as.data.frame(t(MASplier_updated$B[exercise, ]))
names(examine_exercise) <- str_c("LV", LVs_of_interest_exercise_updated$LVid)
We create heatmaps for all LVs here. For LVs 8, 2, 6, and 15, the resulting plots are presented in Figures S5 and S6.
# Write one top-50 gene heatmap PDF per LV to the sensitivity-check directory.
for (i in 1:nrow(MASplier_updated$B)){
filedir <- paste0("../../3_output/2_MASTERS_PLIER/SensitivityCheck/Heatmaps/ExEffect/heatmap",i,".pdf")
customplotTopZ(plierRes = MASplier_updated, data = newyN, priorMat = allPaths, top = 50, index = i, allLVs = TRUE,
annotation_col = simpledf, width = 15, height = 10,
filename=filedir)
}
# Drop the SeqID annotation column before the in-document heatmaps.
# NOTE: this mutates simpledf globally for all later uses.
simpledf$SeqID <- NULL
# Render LV19 and LV25 heatmaps inline (no filename argument).
customplotTopZ(plierRes = MASplier_updated, data = newyN, priorMat = allPaths, top = 50, index = 19, allLVs = TRUE,
annotation_col = simpledf)
customplotTopZ(plierRes = MASplier_updated, data = newyN, priorMat = allPaths, top = 50, index = 25, allLVs = TRUE,
annotation_col = simpledf)
### Response LV Tables The top 100 genes in the Response LVs (2, 6, 8, 15, 19, 25) are presented in Tables S9-14.
# Export a top-100 gene table for each Response LV (Tables S9-S14).
for (k in seq_len(nrow(LVs_of_interest_exercise_updated))) {
  LVtablemaker(MASplier_updated, newyN, allPaths, index = as.numeric(LVs_of_interest_exercise_updated$LVid)[k],
               top = 100, path = "../5_manuscript/Supplements/Tables/")
}
Below, we examine the paired network, constructed from only participants with seq data at both time points.
# Load the precomputed paired (both-timepoint) WGCNA network object.
load(file="../../2_pipeline/1_MASTERS_WGCNA/store/TEMP_paired_network.Rdata")
# Module sizes by numeric label (0 = unassigned/grey).
table(paired_network$colors)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 28893 3822 2589 1108 826 713 544 542 205 173 127 110 78
# Same counts keyed by the conventional WGCNA color names.
table(labels2colors(paired_network$colors))
##
## black blue brown green greenyellow grey
## 542 2589 1108 713 110 28893
## magenta pink purple red tan turquoise
## 173 205 127 544 78 3822
## yellow
## 826
# Map numeric module labels to colors for plotting.
mergedColors = labels2colors(paired_network$colors)
# One dendrogram-plus-colors plot per network block.
for (i in 1:length(paired_network$dendrograms))
{
plotDendroAndColors(paired_network$dendrograms[[i]], mergedColors[paired_network$blockGenes[[i]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
}
# Module labels/colors and module eigengenes for the paired expression set.
modLabels_paired = paired_network$colors
modColors_paired = labels2colors(paired_network$colors)
MEs_paired = moduleEigengenes(paired_expr, modColors_paired)$eigengenes
MEs_paired = orderMEs(MEs_paired)
geneTree_paired = paired_network$dendrograms[[1]]
# Convenience dimensions (samples x genes of the paired expression matrix).
nGenes = ncol(paired_expr);
nSamples = nrow(paired_expr);
# 13 eigengenes: 12 modules plus grey.
length(MEs_paired)
## [1] 13
It has similar features to the baseline, in that most genes are in grey but turquoise is otherwise the largest module.
The module characteristics for the Plasticity analysis are shown in Table S16.
# Module summary statistics for the Plasticity network (Table S16).
paired_mod_table <- ModuleTable(paired_expr, paired_network, modColors_paired) %>%
rownames_to_column(var="module")
We can appreciate the variability in starting ME scores by plotting them and seeing how challenging it is to interpret.
# Reorder the phenotype rows to match the eigengene rows before binding.
resid.pheno.paired <- resid.pheno.paired %>%
arrange(match(rownames(resid.pheno.paired),rownames(MEs_paired)))
all.equal(rownames(resid.pheno.paired), rownames(MEs_paired))
## [1] TRUE
pheno.paired.ME <- cbind(resid.pheno.paired, MEs_paired)
#an example
ggplot(pheno.paired.ME) +
geom_line(mapping = aes(x = Timepoint, y = MEpink, group = SubjectID), color = "pink") +
labs(y = "MEpink") + theme_bw()
# Per-subject ME trajectories for every non-grey module.
for (i in 1:(length(MEs_paired)-1)){
# NOTE(review): the ME columns are located by offsetting with
# ncol(pheno.long.paired), but pheno.paired.ME was built from
# resid.pheno.paired — confirm the two tables have the same column count,
# otherwise the wrong columns are plotted.
module = names(pheno.paired.ME[ncol(pheno.long.paired) + i])
print(ggplot(pheno.paired.ME) +
geom_line(mapping = aes(x = Timepoint, y = pheno.paired.ME[,module],
group = SubjectID), color = str_sub(module, 3)) +
labs(y = str_sub(module, 3)) + theme_bw())
}
# Re-verify row alignment, then bind phenotypes and eigengenes column-wise.
all.equal(row.names(resid.pheno.paired), rownames(MEs_paired))
## [1] TRUE
resid.pheno.paired.ME <- cbind(resid.pheno.paired, MEs_paired)
# Keep identifiers, adjusted traits, and ME scores, keyed by SeqID.
adj.pheno.paired.ME <- resid.pheno.paired.ME %>%
  rownames_to_column(var = "SeqID") %>%
  dplyr::select("SubjectID","SeqID","Timepoint","Batch","Age","Sex","Treatment",
                contains("adj_"), starts_with("ME", ignore.case = FALSE)) %>%
  column_to_rownames(var = "SeqID")
# One row per subject, pre (_1) and post (_3) ME columns side by side.
deltas <- adj.pheno.paired.ME %>%
  dplyr::select(SubjectID, Timepoint, starts_with("ME", ignore.case = FALSE)) %>%
  pivot_wider(id_cols = SubjectID, names_from = Timepoint, values_from = starts_with("ME", ignore.case = FALSE)) %>%
  column_to_rownames(var = "SubjectID")
# Change index per module: CBI = 1 - (pre / post).
CBI <- data.frame(row.names = rownames(deltas))
j <- names(MEs_paired)
for (i in seq_along(j)) {
  pre_col <- str_c(j[i], "_1")
  post_col <- str_c(j[i], "_3")
  CBI[, str_c(j[i], "_cbi")] <- 1 - (deltas[, pre_col] / deltas[, post_col])
}
CBI <- CBI %>%
  rownames_to_column(var = "SubjectID")
hist(CBI$MEblack_cbi) #just look at an example
#I think we have to go back to long
# Attach each subject's week-16 SeqID to their CBI row.
CBI_long <- CBI %>%
mutate(SeqID = MASTERSpheno$SeqID_w16[match(SubjectID, MASTERSpheno$SubjectID)])
CBI_long2 <- CBI %>% #set pre to 0 based on 1-pre/pre
  mutate(SeqID = MASTERSpheno$SeqID_w0[match(SubjectID, MASTERSpheno$SubjectID)]) %>%
  # Zero every CBI column: at baseline 1 - pre/pre == 0 by definition.
  # (Replaces mutate_if keyed off names(CBI_long), which silently depended on
  # CBI_long having exactly the same column order as this pipeline's data.)
  mutate(across(starts_with("ME"), ~ 0))
# Stack the zeroed baselines on top of the week-16 CBIs, then look up each
# row's Timepoint from the long phenotype table.
CBI_plot <- bind_rows(CBI_long2, CBI_long) %>%
  mutate(Timepoint = pheno.long.paired$Timepoint[match(SeqID, pheno.long.paired$SeqID)]) %>%
  column_to_rownames(var = "SeqID")
# Example trajectory: pink module CBI per subject.
ggplot(CBI_plot) +
  geom_line(mapping = aes(x = Timepoint, y = MEpink_cbi, group = SubjectID), color = "pink") +
  labs(y = "pink_cbi") +
  theme_bw()
# Trajectory plot per module. Columns 2 .. (1 + n modules) of CBI_plot are the
# *_cbi columns (position-based indexing; column 1 is SubjectID).
for (i in 2:(2+length(names(MEs_paired))-1)){
module = names(CBI_plot)[i]
# str_sub(module, 3, -5) strips the leading "ME" and trailing "_cbi" to
# recover the color name used for the line color.
print(ggplot(CBI_plot) +
geom_line(mapping = aes(x = Timepoint, y = CBI_plot[,module],
group = SubjectID), color = str_sub(module,3,-5)) +
labs(y = str_sub(module, 3)) + theme_bw())
}
We are going to drop all the baselines which are essentially just adding zeros to the data set.
# Keep only the week-16 CBI rows, rekeyed by the baseline SeqID (*_1) so they
# line up with the baseline-keyed phenotype table.
CBI_df <- CBI_long %>%
  dplyr::select(-SubjectID) %>%
  mutate(SeqID = str_c(str_sub(SeqID, 0, -3), "_1", sep = "")) %>%
  column_to_rownames(var = "SeqID")
# Phenotype change scores for the same 22 paired participants, ordered to
# match CBI_df's rows.
paired_change <- resid.pheno.wide %>%
  filter(SeqID_w0 %in% rownames(CBI_df)) %>% #22 people
  arrange(SeqID_w0 = match(SeqID_w0, rownames(CBI_df))) %>%
  column_to_rownames(var = "SeqID_w0")
#check that it's compatible with the other data
all.equal(rownames(CBI_df), rownames(paired_change))
## [1] TRUE
# Align CBI_df to paired_change's row order. (The original indexed CBI_df by
# its own rownames — a no-op; indexing by paired_change's rownames is what the
# following all.equal check is actually meant to guarantee. The prior check
# shows the orders already match, so results are unchanged.)
CBI_df <- CBI_df[rownames(paired_change),]
all.equal(rownames(CBI_df), rownames(paired_change))
## [1] TRUE
nSamples = nrow(CBI_df)
# Correlate each module's CBI with the adjusted hypertrophy change scores
# (pairwise-complete observations).
moduleTraitCor_paired = cor(CBI_df,
paired_change[,c("adj_d_CT_area", "adj_delta_bilateral_thigh_normalized",
"adj_d_myofiber_CSA")], use="p");
# Student asymptotic p-values for the correlations (WGCNA helper).
moduleTraitPvalue_paired = corPvalueStudent(moduleTraitCor_paired, nSamples);
sizeGrWindow(8,8)
# Cell labels: "r (p)" with 1 significant digit each.
textMatrix = paste(signif(moduleTraitCor_paired, 1), " (", signif(moduleTraitPvalue_paired, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor_paired)
par(mar = c(12, 8, 3, 2));
# Module x trait correlation heatmap (Figure-style WGCNA display).
labeledHeatmap(Matrix = moduleTraitCor_paired,
xLabels = c("adjusted_change_area_CT","adjusted_change_DEXA",
"adj_change_fCSA"),
yLabels = names(MEs_paired),
ySymbols = names(MEs_paired),
colorLabels = FALSE,
colors = blueWhiteRed(100),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
cex.lab.x = 0.8,
cex.lab.y = 0.5,
cex.main = 0.8,
zlim = c(-1,1),
main = "Change in Muscle Phenotype vs. ME Change Indices")
# Bonferroni threshold across the non-grey modules.
p.threshold <- 0.05 / (length(MEs_paired) - 1) #do not penalize for grey module
# Modules passing the Bonferroni threshold on any hypertrophy outcome.
IDmods <- as.data.frame(moduleTraitPvalue_paired) %>%
  filter(rownames(moduleTraitPvalue_paired) != "MEgrey_cbi") %>%
  filter((adj_delta_bilateral_thigh_normalized <= p.threshold) |
           (adj_d_CT_area <= p.threshold) | (adj_d_myofiber_CSA <= p.threshold))
# Modules passing at a nominal 0.05 on any outcome.
nominal.mods <- as.data.frame(moduleTraitPvalue_paired) %>%
  filter(rownames(moduleTraitPvalue_paired) != "MEgrey_cbi") %>%
  filter((adj_delta_bilateral_thigh_normalized <= 0.05) |
           (adj_d_CT_area <= 0.05) | (adj_d_myofiber_CSA <= 0.05))
# Union of both lists, with the "ME" prefix stripped to leave color names.
mods_of_interest_plastic <- str_sub(unique(c(rownames(IDmods), rownames(nominal.mods))), 3) #drop the ME
nominal.mods
The pink module scatterplot is shown in Figure S7B.
# Bind phenotype change scores and module CBIs (rows already aligned above).
paired_CBI <- cbind(paired_change, CBI_df)
# Pink CBI vs. adjusted change in myofiber CSA (Figure S7B).
ggplot(paired_CBI) + geom_point(mapping=aes(x=MEpink_cbi,y=adj_d_myofiber_CSA),colour="pink", size = 3) +
geom_smooth(mapping=aes(x=MEpink_cbi,y=adj_d_myofiber_CSA), method = "lm", colour = "black") +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt"))
## `geom_smooth()` using formula 'y ~ x'
#+ggsave(filename = "../../3_output/1_MASTERS_WGCNA/Scatterplots/paired/pinkCBI_vs_delta_fCSA.pdf", width = 6, height = 3.5)
# Purple CBI vs. adjusted change in normalized thigh lean mass (DXA).
ggplot(paired_CBI) + geom_point(mapping=aes(x=MEpurple_cbi,y=adj_delta_bilateral_thigh_normalized),colour="purple", size = 3) +
geom_smooth(mapping=aes(x=MEpurple_cbi,y=adj_delta_bilateral_thigh_normalized), method = "lm", colour = "black") +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt"))
## `geom_smooth()` using formula 'y ~ x'
#+ggsave(filename = "../../3_output/1_MASTERS_WGCNA/Scatterplots/paired/purpleCBI_vs_delta_DEXA.pdf", width = 6, height = 3.5)
# Red CBI vs. the same DXA outcome.
ggplot(paired_CBI) + geom_point(mapping=aes(x=MEred_cbi,y=adj_delta_bilateral_thigh_normalized),colour="red", size = 3) +
geom_smooth(mapping=aes(x=MEred_cbi,y=adj_delta_bilateral_thigh_normalized), method = "lm", colour = "black") +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt"))
## `geom_smooth()` using formula 'y ~ x'
Both red and purple have a single very low one, and the biggest loser on fCSA is also very low on pink, so we will run a sensitivity check on these.
# Sensitivity check: drop pink CBIs beyond +/- 3 SD of the mean, then refit.
pink_mu <- mean(paired_CBI$MEpink_cbi)
pink_sd <- sd(paired_CBI$MEpink_cbi)
paired_CBI_sens_check <- paired_CBI %>%
  filter(MEpink_cbi <= pink_mu + 3 * pink_sd,
         MEpink_cbi >= pink_mu - 3 * pink_sd)
CBIpink.fit <- lm(data = paired_CBI_sens_check, MEpink_cbi ~ adj_d_myofiber_CSA)
summary(CBIpink.fit)
##
## Call:
## lm(formula = MEpink_cbi ~ adj_d_myofiber_CSA, data = paired_CBI_sens_check)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.1812 -2.2815 0.7174 3.8365 6.9491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.113180 1.175594 -0.947 0.35499
## adj_d_myofiber_CSA 0.004411 0.001187 3.715 0.00137 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.514 on 20 degrees of freedom
## Multiple R-squared: 0.4083, Adjusted R-squared: 0.3787
## F-statistic: 13.8 on 1 and 20 DF, p-value: 0.001369
# Same +/- 3 SD sensitivity check for the red module CBI.
red_mu <- mean(paired_CBI$MEred_cbi)
red_sd <- sd(paired_CBI$MEred_cbi)
paired_CBI_sens_check <- paired_CBI %>%
  filter(MEred_cbi <= red_mu + 3 * red_sd,
         MEred_cbi >= red_mu - 3 * red_sd)
CBIred.fit <- lm(data = paired_CBI_sens_check, MEred_cbi ~ adj_delta_bilateral_thigh_normalized)
summary(CBIred.fit)
##
## Call:
## lm(formula = MEred_cbi ~ adj_delta_bilateral_thigh_normalized,
## data = paired_CBI_sens_check)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88045 -0.57530 -0.04829 0.84802 2.13651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.30082 0.23276 -1.292 0.212
## adj_delta_bilateral_thigh_normalized -0.01703 0.02084 -0.817 0.424
##
## Residual standard error: 1.063 on 19 degrees of freedom
## Multiple R-squared: 0.03396, Adjusted R-squared: -0.01688
## F-statistic: 0.6679 on 1 and 19 DF, p-value: 0.4239
Red is not upheld.
# Same +/- 3 SD sensitivity check for the purple module CBI.
# BUG FIX: the exclusion bounds previously used sd(paired_CBI$MEred_cbi)
# (copy-paste from the red check above); the purple check must use the
# purple column's own SD.
paired_CBI_sens_check <- paired_CBI %>%
filter(MEpurple_cbi<= mean(paired_CBI$MEpurple_cbi)+3*sd(paired_CBI$MEpurple_cbi) &
MEpurple_cbi>= mean(paired_CBI$MEpurple_cbi)-3*sd(paired_CBI$MEpurple_cbi))
CBIpurple.fit <- lm(data = paired_CBI_sens_check, MEpurple_cbi ~adj_delta_bilateral_thigh_normalized)
summary(CBIpurple.fit)
##
## Call:
## lm(formula = MEpurple_cbi ~ adj_delta_bilateral_thigh_normalized,
## data = paired_CBI_sens_check)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.30118 -0.47785 -0.02571 0.67205 2.20435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.524108 0.219631 2.386 0.0276 *
## adj_delta_bilateral_thigh_normalized -0.006458 0.019665 -0.328 0.7462
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.003 on 19 degrees of freedom
## Multiple R-squared: 0.005643, Adjusted R-squared: -0.04669
## F-statistic: 0.1078 on 1 and 19 DF, p-value: 0.7462
Also purple does not stand up. We will only pursue the pink for downstream analysis.
For the mod members table, Gene-Trait significance is not shown (see text). This results because we have changed the eigengene value to a change score; therefore it's challenging to get the association with the trait. The module membership for genes in the pink module is presented in Table S17.
# Recompute eigengenes and each gene's module membership (kME = gene-to-ME
# correlation) for the paired network.
MEs = moduleEigengenes(paired_expr, modColors_paired)$eigengenes
modNames = substring(names(MEs), 3)
nSamples = nrow(paired_expr)
#code here is the same thing that generates module gene lists and makes scatterplots
geneModuleMembership = as.data.frame(cor(paired_expr, MEs, use = "p"));
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
column = match("pink", modNames);
moduleGenes = modColors_paired=="pink";
# |kME| for the pink-module genes. The column is named explicitly at
# construction, replacing the fragile rename of data.frame's auto-generated
# "abs.geneModuleMembership.moduleGenes..column.." name.
pink_modmem <- data.frame(membership = abs(geneModuleMembership[moduleGenes, column]),
row.names = rownames(geneModuleMembership[moduleGenes,]))
# Split the rowname into ENSG id (with version) and gene symbol.
pink_modmem <- pink_modmem %>%
  rownames_to_column(var = "geneIDs") %>%
  separate(geneIDs, into = c("ensg_version","symbol"), sep = "_")
# Candidate kME cutoffs: median (50th), 75th, and 80th percentiles.
threshold <- median(pink_modmem$membership) #just the 50th percentile
perc_75th <- quantile(pink_modmem$membership, probs = 0.75)
perc_80th <- quantile(pink_modmem$membership, probs = 0.8)
#ran both to see if there was a big difference in number of genes surpassing threshold
# Flag genes at or above each cutoff (1 / NA), then sort by descending kME.
pink_modmem <- pink_modmem %>%
  mutate(top_half = ifelse(membership >= threshold, 1, NA),
         perc_75th = ifelse(membership >= perc_75th, 1, NA),
         perc_80th = ifelse(membership >= perc_80th, 1, NA)) %>%
  arrange(-membership)
pink_modmem
The paired TOM were run in Cheaha on 09.24.20. Below we unpack the getCytoscape function and apply it for use for this one paired module. We have to do this because of the key limitation which is no gene-trait significance column. Instead, we limit interpretation to those genes with the highest membership (kME over 80th percentile). This code is not run live in the knit markdown because of computational limitations. Please see Figure S7C for output.
#paired_TOM = TOMsimilarityFromExpr(paired_expr, power = 12)
#load(file = "../../2_pipeline/1_MASTERS_WGCNA/store/paired_TOM.Rdata")
# Unpacked body of the getCytoscape() helper, applied to the pink plasticity
# module only, restricted to genes with kME above the 80th percentile.
module = "pink"
detailTable = pink_modmem[!is.na(pink_modmem$perc_80th),] #define 80th percentile
ExprSet = paired_expr
fraction="all"
modColors = modColors_paired
TOM_matrix = paired_TOM
# Rebuild the "ensg_symbol" gene identifiers for the retained genes.
subset <- detailTable %>%
mutate(geneID = str_c(ensg_version,symbol,sep="_")) %>%
dplyr::select(geneID)
#subset the large matrix to define what meets criteria
all_genes <- colnames(ExprSet)
inSubset = is.finite(match(all_genes, subset$geneID))
subset_ids = all_genes[inSubset];
# Keep only the gene symbol (text after the last underscore) for node labels.
subset_ids_gene <- gsub(pattern = ".*_","",subset_ids)
#which enables subsetting the TOM matrix to only what meets criteria
subset_TOM = TOM_matrix[inSubset, inSubset];
dimnames(subset_TOM) = list(subset_ids_gene, subset_ids_gene)
#perform export to Cytoscape
cyt = exportNetworkToCytoscape(subset_TOM,
edgeFile = paste("../../3_output/1_MASTERS_WGCNA/Cytoscape/GeneBiotype-edges-",
paste(module, collapse="-"), ".txt", sep=""),
nodeFile = paste("../../3_output/1_MASTERS_WGCNA/Cytoscape/GeneBiotype-nodes",
paste(module, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = subset_ids_gene,
altNodeNames = subset_ids_gene,
nodeAttr = modColors[inSubset]);
# FIX: a stray closing brace left over from unpacking getCytoscape() was
# removed here; it had no matching opener and would be a syntax error if this
# (not-run) chunk were executed.
Calculate the change score for PLIER LVs.
# Change score (CBI) for each PLIER LV: 1 - (pre / post), mirroring the
# WGCNA module CBI above. Samples as rows, LVs as columns.
plier_cbi <- as.data.frame(t(MASplier_updated$B))
# Post (week 16, *_3) samples, and pre (*_1) samples restricted to subjects
# who also have a week-16 sample.
plier_cbi_3 <- plier_cbi[str_detect(rownames(plier_cbi),"_3$"),]
plier_cbi_1 <- plier_cbi[str_detect(rownames(plier_cbi),"_1$"),] %>%
rownames_to_column(var = "SeqID") %>%
filter(str_sub(SeqID, 0, -3) %in% str_sub(rownames(plier_cbi_3),0,-3)) %>%
column_to_rownames(var = "SeqID")
all.equal(str_sub(rownames(plier_cbi_1),0,-3), str_sub(rownames(plier_cbi_3),0,-3)) #good
## [1] TRUE
# Element-wise CBI across all LVs at once.
calc_CBI <- 1 - (plier_cbi_1/plier_cbi_3)
# Adjusted hypertrophy outcomes for the same paired subjects.
to_bind <- pathdata_updated[,c("adj_delta_bilateral_thigh_normalized","adj_d_CT_area","adj_d_myofiber_CSA")]
to_bind <- to_bind%>%
filter(rownames(to_bind) %in% rownames(calc_CBI))
all.equal(rownames(to_bind), rownames(calc_CBI))
## [1] TRUE
pathdata_updated_plasticity <- cbind(calc_CBI, to_bind)
# Per-LV regressions of the LV change score on each adjusted hypertrophy
# outcome; store the slope p-values. The loop bound is derived from the
# number of LVs rather than hard-coding 25.
n_LV_plast <- nrow(MASplier_updated$B)
signifTable_plasticity_updated <- as.data.frame(matrix(nrow = n_LV_plast)) %>%
  mutate(LV = rownames(MASplier_updated$B), adj_delta_bilateral_thigh_normalized = NA,
         adj_d_CT_area = NA, adj_d_myofiber_CSA = NA) %>%
  dplyr::select(-V1)
for (i in seq_len(n_LV_plast)) {
  # The first n_LV_plast columns of pathdata_updated_plasticity are LV CBIs.
  j <- pathdata_updated_plasticity[, i]
  fit1 <- lm(data = pathdata_updated_plasticity, formula = j ~ adj_delta_bilateral_thigh_normalized)
  signifTable_plasticity_updated$adj_delta_bilateral_thigh_normalized[i] <- round(summary(fit1)[["coefficients"]][2, 4], digits = 4)
  fit2 <- lm(data = pathdata_updated_plasticity, formula = j ~ adj_d_CT_area)
  signifTable_plasticity_updated$adj_d_CT_area[i] <- round(summary(fit2)[["coefficients"]][2, 4], digits = 4)
  fit3 <- lm(data = pathdata_updated_plasticity, formula = j ~ adj_d_myofiber_CSA)
  signifTable_plasticity_updated$adj_d_myofiber_CSA[i] <- round(summary(fit3)[["coefficients"]][2, 4], digits = 4)
}
# LVs nominally associated (p <= 0.05) with any plasticity outcome.
LVs_of_interest_plastic_updated <- signifTable_plasticity_updated %>%
  filter(adj_delta_bilateral_thigh_normalized <= 0.05 |
           adj_d_CT_area <= 0.05 |
           adj_d_myofiber_CSA <= 0.05) %>%
  mutate(LVid = str_extract(LV, pattern = "[:digit:]{1,}"))
LVs_of_interest_plastic_updated
We can see the associations and we will do something similar to the mods_of_interest table above (automating this is not necessary since it's only four).
# Pair each plasticity LV with the outcome it was associated with.
LVs_plasticity <- as.data.frame(LVs_of_interest_plastic_updated$LVid) %>%
rename("LV" = "LVs_of_interest_plastic_updated$LVid") %>%
mutate(LV = as.numeric(LV)) %>%
# NOTE(review): this trait vector is hand-matched to the four LVs returned
# above — it must be revisited if the LV filter changes length or order.
mutate(trait = c("adj_d_myofiber_CSA","adj_d_myofiber_CSA","adj_delta_bilateral_thigh_normalized","adj_d_myofiber_CSA"))
# One scatterplot with linear fit per LV/trait pairing.
for (i in 1:nrow(LVs_plasticity)){
print(ggplot(pathdata_updated_plasticity) +
geom_point(mapping=aes(x=pathdata_updated_plasticity[,LVs_plasticity$LV[i]],
y=pathdata_updated_plasticity[,names(pathdata_updated_plasticity)==LVs_plasticity$trait[i]]),
colour="black", size = 3) +
geom_smooth(mapping=aes(x=pathdata_updated_plasticity[,LVs_plasticity$LV[i]],
y=pathdata_updated_plasticity[,names(pathdata_updated_plasticity)==LVs_plasticity$trait[i]]),
method = "lm", colour = "black") +
scale_y_continuous(name = LVs_plasticity$trait[i]) +
scale_x_continuous(name = str_c("LV",LVs_plasticity$LV[i])) +
theme_classic()+
theme(axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.ticks.length = unit(10, "pt")))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Clearly, some interesting patterns are obvious. Those that do not appear to have good spread around the x axis will likely be lost during sensitivity checks. Below, we explore this.
The resulting table is the final column of manuscript Table 2 (Plasticity LVs). Because no LVs were related to CT, we will only focus on the secondary DXA outcome and the exploratory myofiber CSA outcome.
# Name the LV-score columns LV1..LV25 so the outlier loop can index by name.
names(pathdata_updated_plasticity)[1:25] <- str_c("LV", 1:25)
# Empty results table: one row per LV, p/r columns for DXA and myofiber CSA.
outlier_check_plasticity <- as.data.frame(matrix(ncol = 6, nrow = 25))
names(outlier_check_plasticity) <- c("LV", "n", "r_val_adj_delta_bilateral_thigh_normalized",
                                     "p_val_adj_delta_bilateral_thigh_normalized", "r_val_adj_d_myofiber_CSA",
                                     "p_val_adj_d_myofiber_CSA")
# For each LV: drop +/- 3 SD outliers on the LV change score, refit against
# the DXA and myofiber CSA outcomes, and record n, r-squared, and p.
for (i in 1:25){
name = str_c("LV",i)
outlier_check_plasticity$LV[i] = name
clean_path <- pathdata_updated_plasticity %>% #drops anyone above and below 3 standard deviations
filter(pathdata_updated_plasticity[,name]<= mean(pathdata_updated_plasticity[,name])+
3*sd(pathdata_updated_plasticity[,name]) &
pathdata_updated_plasticity[,name]>= mean(pathdata_updated_plasticity[,name])-
3*sd(pathdata_updated_plasticity[,name]))
fit <- lm(data = clean_path, clean_path[,name] ~adj_delta_bilateral_thigh_normalized)
outlier_check_plasticity$r_val_adj_delta_bilateral_thigh_normalized[i] <- round(summary(fit)$r.squared,3)
outlier_check_plasticity$p_val_adj_delta_bilateral_thigh_normalized[i] <- round(summary(fit)[["coefficients"]][2,4],3)
outlier_check_plasticity$n[i] <- nrow(clean_path)
fit2 <- lm(data = clean_path, clean_path[,name] ~adj_d_myofiber_CSA)
outlier_check_plasticity$r_val_adj_d_myofiber_CSA[i] <- round(summary(fit2)$r.squared,3)
outlier_check_plasticity$p_val_adj_d_myofiber_CSA[i] <- round(summary(fit2)[["coefficients"]][2,4],3)
}
#present LVs that survive the check for DXA and myofiber CSA; write to a variable so we can use it later
LV_sensitivity_check_plasticity <- outlier_check_plasticity %>%
filter(LV %in% str_c("LV",LVs_of_interest_plastic_updated$LVid)) %>%
filter(p_val_adj_d_myofiber_CSA <.05 | p_val_adj_delta_bilateral_thigh_normalized<0.05)
LV_sensitivity_check_plasticity
There is just one: LV20.
Unfortunately, I don't think we can plot the gene-by-gene heatmap for this LV. Much like the pink scatterplot above, we cannot generate the same effect since we have created change scores from the expression data. The resulting heatmap is not very informative. Instead, just export the top 100 genes and present in Table S18.
# Export the top-100 genes for LV20 (Table S18).
# NOTE(review): this path starts with "../3_output" while every other export
# in this document uses "../../3_output" — confirm the intended directory.
LVtablemaker(plierRes = MASplier_updated, data = newyN, priorMat = allPaths, index = 20, top = 100,
path = "../3_output/2_MASTERS_PLIER/LVtables")
The PLIER analysis is used to annotate top genes in the WGCNA Plasticity analysis pink module (Table S17).
# Cross-reference pink-module genes against the PLIER LVs/pathways they load on.
pinkPLIER <- mod_LV_check(plierResult = MASplier_updated,
genetable = read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/paired/pinkGeneTable.csv"))
# Tally the final (comma-separated) pathway annotation per gene.
summary(as.factor(gsub(".*,","",pinkPLIER$LV_path)))
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
## 4
## LV 10
## 1
## MetaMEx_AcuteResistance
## 3
## MetaMEx_AerobicTraining
## 2
## MetaMEx_ResistanceTraining
## 49
## Type_I_markers
## 3
#write.csv(pinkPLIER, "../../3_output/1_MASTERS_WGCNA/Cytoscape/PLIER_connections_pink.csv")
We cannot test for overlapping "key" genes because the networks are not related to the same outcome metrics.
We will merge the prediction and response LVs for presentation. Prepare for merging from tables produced along the way.
# Assemble manuscript Table 2 from the Prediction, Response, and Plasticity
# sensitivity-check results, keyed on the LV label.
LV_Table2_prediction <- LV_sensitivity_check %>%
  dplyr::select(-n, -starts_with("r_val")) %>%
  rename("prediction_adj_d_CT_area"="p_val_adj_d_CT_area",
         "prediction_adj_delta_bilateral_thigh_normalized" = "p_val_adj_delta_bilateral_thigh_normalized") %>%
  # Blank out (NA) any p-value above 0.05 so only significant cells display.
  mutate(prediction_adj_d_CT_area = ifelse(prediction_adj_d_CT_area<=0.05, prediction_adj_d_CT_area, NA),
         prediction_adj_delta_bilateral_thigh_normalized = ifelse(prediction_adj_delta_bilateral_thigh_normalized<=0.05, prediction_adj_delta_bilateral_thigh_normalized, NA))
LV_Table2_response <- LV_sensitivity_check_response %>%
  mutate(LV= paste0("LV", LVid)) %>%
  dplyr::select(-LVid) %>% rename("response_pre_vs_post" = "adj_p_RTeffect")
LV_Table2_plasticity <- LV_sensitivity_check_plasticity %>%
  dplyr::select(-n, -starts_with("r_val"), -contains("myofiber")) %>%
  rename("plasticity_adj_delta_bilateral_thigh_normalized"= "p_val_adj_delta_bilateral_thigh_normalized")
# Both joins use an explicit key; the second previously relied on the default
# common-column join and emitted a `Joining, by = "LV"` message.
full_join(LV_Table2_prediction, LV_Table2_response, by = "LV") %>%
  full_join(LV_Table2_plasticity, by = "LV") %>%
  arrange(as.numeric(gsub("LV","",LV)))
## Joining, by = "LV"
In order to help us interpret some of the overlaps and pathway annotations, we need to go back to the original MetaMEx data outputs and examine the directions of these. Let's start with resistance training.
# MetaMEx resistance-training statistics for healthy (HLY) cohorts, reshaped
# to one row per gene x study with logFC and adjusted p side by side.
RT_Stats <- readRDS("/data/user/kmlavin/PLIER/0_data/training_resistance.Rds")
rt_long <- RT_Stats %>%
  dplyr::select(contains("_HLY_")) %>%
  mutate(gene = rownames(RT_Stats)) %>%
  relocate(gene, .before = colnames(RT_Stats)[1]) %>%
  pivot_longer(-gene, names_to = "studyID", values_to = "value") %>%
  separate(studyID, into = c("stat", "GEO", "exercise", "mode", "muscle", "sex",
                             "age", "trained", "lean", "healthy", "hour", "week"), sep = "_") %>%
  filter(stat == "logFC" | stat == "adj.P.Val")
RT_gene_data <- pivot_wider(rt_long, names_from = stat, values_from = value)
load("/data/user/kmlavin/PLIER/2_pipeline/store/MetaMExMatrix.Rdata")
load("../../2_pipeline/1_MASTERS_WGCNA/store/MetaMEx_directional.Rdata")
# Directional gene sets: |logFC| > 1 at adjusted p <= 0.05.
RT_up <- RT_gene_data %>%
  filter((logFC > 1) & (adj.P.Val <= 0.05))
RT_down <- RT_gene_data %>%
  filter((logFC < -1) & (adj.P.Val <= 0.05))
RT_up_genes <- unique(RT_up$gene)
RT_down_genes <- unique(RT_down$gene)
RT_common <- unique(c(RT_up_genes, RT_down_genes))
all(RT_common %in% RT_geneset) #matches input that would have gone into PLIER
## [1] TRUE
We will look for overlaps with the up vs. down lists to see the general direction of the patterns.
#load those that are not in environment
pinkgenes <- read.csv(file="../../3_output/1_MASTERS_WGCNA/GeneTables/paired/pinkGeneTable.csv")
salmon_gene_table <- read.csv("../../3_output/1_MASTERS_WGCNA/GeneTables/basal/salmonGeneTable.csv")
red_RT_up <- IDoverlaps(red_gene_table$symbol, RT_up_genes) #23
red_RT_down <- IDoverlaps(red_gene_table$symbol, RT_down_genes) #3
magenta_RT_up <- IDoverlaps(magenta_gene_table$symbol, RT_up_genes) #16
magenta_RT_down <- IDoverlaps(magenta_gene_table$symbol, RT_down_genes) #3
salmon_RT_up <- IDoverlaps(salmon_gene_table$symbol, RT_up_genes) #27
salmon_RT_down <- IDoverlaps(salmon_gene_table$symbol, RT_down_genes) #0
pink_RT_up <- IDoverlaps(pinkgenes$symbol, RT_up_genes) #18
pink_RT_down <- IDoverlaps(pinkgenes$symbol, RT_down_genes) #1
Aerobic training popped out a bit in the analysis too so it would be a good idea to have the lists.
# Same reshaping pipeline as for resistance training, applied to the
# MetaMEx aerobic-training statistics.
AT_Stats <- readRDS("/data/user/kmlavin/PLIER/0_data/training_aerobic.Rds")
at_long <- AT_Stats %>%
  dplyr::select(contains("_HLY_")) %>%
  mutate(gene = rownames(AT_Stats)) %>%
  relocate(gene, .before = colnames(AT_Stats)[1]) %>%
  pivot_longer(-gene, names_to = "studyID", values_to = "value") %>%
  separate(studyID, into = c("stat", "GEO", "exercise", "mode", "muscle", "sex",
                             "age", "trained", "lean", "healthy", "hour", "week"), sep = "_") %>%
  filter(stat == "logFC" | stat == "adj.P.Val")
AT_gene_data <- pivot_wider(at_long, names_from = stat, values_from = value)
# Directional gene sets: |logFC| > 1 at adjusted p <= 0.05.
AT_up <- AT_gene_data %>%
  filter((logFC > 1) & (adj.P.Val <= 0.05))
AT_down <- AT_gene_data %>%
  filter((logFC < -1) & (adj.P.Val <= 0.05))
AT_up_genes <- unique(AT_up$gene)
AT_down_genes <- unique(AT_down$gene)
AT_common <- unique(c(AT_up_genes, AT_down_genes))
all(AT_common %in% AT_geneset) #matches input that would have gone into PLIER
## [1] TRUE
# Overlap of each module's genes with the AT up/down lists (counts inline).
red_AT_up <- IDoverlaps(red_gene_table$symbol, AT_up_genes) #14
red_AT_down <- IDoverlaps(red_gene_table$symbol, AT_down_genes) #3
magenta_AT_up <- IDoverlaps(magenta_gene_table$symbol, AT_up_genes) #9
magenta_AT_down <- IDoverlaps(magenta_gene_table$symbol, AT_down_genes) #2
salmon_AT_up <- IDoverlaps(salmon_gene_table$symbol, AT_up_genes) #3
salmon_AT_down <- IDoverlaps(salmon_gene_table$symbol, AT_down_genes) #1
pink_AT_up <- IDoverlaps(pinkgenes$symbol, AT_up_genes) #7
pink_AT_down <- IDoverlaps(pinkgenes$symbol, AT_down_genes) #3
These were manually compiled to produce Table S15.
A clear question is whether any of the genes we found in the red and magenta modules are in any way similar to what was driving the responder status in the Thalacker Mercer 2013 publication. Let's pull in data from the supplement.
# Thalacker-Mercer 2013 supplementary fold-differences. Fix the truncated
# column headers, drop the unnamed filler columns, and recode "ND" (not
# detected) to NA in the three responder-contrast columns.
TMgenes <- read.csv("../../0_data/TM_gene_fold_differences.csv") %>%
  rename("Extr_Non" = "Xt_Non",
         "Extr_Mod" = "Xtr_Mod") %>%
  dplyr::select(-starts_with("X")) %>%
  mutate(across(c(Extr_Non, Extr_Mod, Mod_Non), ~ na_if(.x, "ND"))) #might be convenient to have them as these
We will also collapse the red and magenta modules, both of which were associated with the whole-muscle hypertrophy outcomes of interest to at least a nominal degree.
# Pool the red and magenta module genes and intersect with the
# Thalacker-Mercer microarray gene list.
current_genes <- c(red_gene_table$symbol, magenta_gene_table$symbol)
microarray_genes <- TMgenes$Gene
show <- IDoverlaps(current_genes, microarray_genes)
# Display the fold-difference rows for the overlapping genes.
TMgenes[TMgenes$Gene %in% show,]
Only three are the same, but the directionality seems different. DCN and TGFB1 are actually lower in responders in the other paper. LAMA1 is following the same pattern, but only in Extr vs. Mod.
It may be of interest to collapse the PLIER LVs that were most associated with the changes.
# Pool the top genes of the CT-associated LVs and intersect with the
# Thalacker-Mercer list.
all_CT_genes <- unique(c(LV7$X, LV16$X, LV23$X))
show <- IDoverlaps(all_CT_genes, microarray_genes)
TMgenes[TMgenes$Gene %in% show,]
Here there are three, and two are in the same direction this time. We interpret this in the Discussion, keeping in mind that both LV7 and LV23 actually have negative relationships with hypertrophy.
# Spot-check whether RELA appears among the LV23 top genes.
"RELA" %in% LV23$X #completely lucky guess
## [1] FALSE
Thus all three genes that overlap are in fact in the same direction.
What about the LV that syncs up with DXA?
# Reload the LV19 top-100 table and intersect with the Thalacker-Mercer list.
LV19 <- read.csv("../../3_output/2_MASTERS_PLIER/SensitivityCheck/LVtables/DXA/LV19_table_Top100.csv")
show <- IDoverlaps(LV19$X, microarray_genes)
TMgenes[TMgenes$Gene %in% show,]
Here we see two genes, going in opposite direction. IGF2 might be an interesting one to follow up, but overall there is rather little overlap.
All preliminary code can be found in: /data/project/bamman-lab/MASTERS/1_code Subdirectories for WGCNA and PLIER. Be sure to see the PLIER sensitivity check in /MASTERS/2_pipeline/2_MASTERS_PLIER/MASTERS_PLIER_senscheck.Rmd
# Snapshot every object name in the session for the manuscript record.
listofthings <- ls()
write.csv(listofthings, file = "../../5_manuscript/R_environment_variables.csv")
For this paper, the majority of samples have already been uploaded to GEO. We included 4 others that were not part of the original data set. Therefore, we will subset to those four for relevant phenotypes and make sure that we create a featureCounts file that only has those. Update: GEO actually wants all of the included samples to be part of this analysis. We need to export the data set for all 53 samples, and then they will pull the FASTQs from the original posting, in accession number GSE157585
# Phenotype export for GEO: all samples (the four-sample filter was dropped
# when GEO requested the full set).
pheno.export <- pheno.long %>%
  #filter(SeqID %in% c("P_5_1", "P_5_3", "P_8_1", "P_8_3")) %>%
  dplyr::select(SeqID, Age, Sex) %>%
  mutate(Timepoint = str_sub(SeqID, -1))
# Counts for the four samples not in the original GEO posting.
keepcounts <- c("P_5_1", "P_5_3", "P_8_1", "P_8_3")
featureCounts <- cts.updated %>%
  dplyr::select(all_of(keepcounts))
#write.table(featureCounts, file="../../5_manuscript/GEO/featureCounts.txt")
#write.csv(pheno.export, file="../../5_manuscript/GEO/subjects.csv")
md5 sums were created in linux using the following commands:
cd /data/project/bamman-lab/MASTERS/0_data/0_preprocessing
find ../FASTQ/ -name "*.fq.*" -exec md5sum {} \; > ../FASTQ/GEOmd5.list
...and the md5 list file was manually exported to my desktop and used to fill in the Excel sheet.
For the others, we need to pull the GSM number from GEO.
# GEO series metadata for GSE157585; file has no header and its rows are
# transposed relative to the desired layout.
gsm <- read.csv("../../5_manuscript/GEO/GSE157585.csv", header = FALSE)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on '../../5_manuscript/GEO/
## GSE157585.csv'
gsm <- t(gsm)
gsm <- as.data.frame(gsm)
names(gsm) <- c("ID","GSM")
gsm_new <- gsm %>%
# NOTE(review): "(?#:)" is a regex inline comment, so the pattern is
# effectively "[:graph:]{1,}" — the first run of printable characters in ID;
# the trailing colon is stripped by the gsub below. Confirm this matches the
# intended ID format.
mutate(SeqID = str_extract(ID, pattern = "(?#:)[:graph:]{1,}")) %>%
mutate(SeqID = gsub(pattern = ":", replacement = "", SeqID)) %>%
filter(SeqID %in% names(cts.updated))
gsm_new
# 49 samples already have GSM accessions on GEO.
n_distinct(gsm_new$GSM)
## [1] 49
## [1] 49
# SeqID -> GSM lookup table; samples with no GEO accession keep their SeqID
# as a placeholder "GSM" value.
key <- gsm_new %>%
  dplyr::select(SeqID, GSM)
addthese <- names(cts.updated[!names(cts.updated) %in% key$SeqID])
add.these <- data.frame(SeqID = addthese, GSM = addthese)
key2 <- rbind(add.these, key)
# Replace SeqID column names with their GSM accessions.
# FIX: pass one named vector (names = patterns, values = replacements) so each
# column name is checked against every mapping. The previous call passed
# parallel vectors, which str_replace_all pairs element-wise with the column
# names — silently depending on key2 rows being in the same order as
# names(cts.updated).
processed_data <- cts.updated %>%
  rename_with(~ str_replace_all(., setNames(key2$GSM, key2$SeqID)))
#write.table(processed_data, file="../../5_manuscript/GEO/featureCounts.txt")